#!/bin/sh
exec ${GUILE-guile} -e '(ttn-do concordance)' -s $0 "$@" # -*-scheme-*-
!#
;;; concordance

;; Copyright (C) 2001, 2002, 2003, 2004, 2005,
;;   2006, 2007, 2009, 2010 Thien-Thi Nguyen
;;
;; This file is part of ttn-do, released under the terms of the
;; GNU General Public License as published by the Free Software
;; Foundation; either version 3, or (at your option) any later
;; version.  There is NO WARRANTY.  See file COPYING for details.

;;; Commentary:

;; Usage: concordance [options] MBOX...
;;
;; Display concordance of MBOX ..., each a Unix-mbox format file.
;; Options:
;;
;;  -q, --quiet            -- don't report progress
;;  -x, --exclude WORDS    -- ignore WORDS, a comma-separated list of words
;;  -i, --case-insensitive -- downcase words on read
;;  -m, --minimum N        -- ignore words with length < N (default 1)
;;  -M, --maximum N        -- ignore words with length > N (default 59)
;;
;; Only the mail message bodies are scanned.

;;; Code:

(define-module (ttn-do concordance)
  #:export (main)
  #:use-module ((ttn-do zzz banalities) #:select (check-hv
                                                  qop<-args))
  #:use-module ((srfi srfi-13) #:select (string-tokenize))
  #:use-module ((srfi srfi-14) #:select (char-set-complement
                                         char-set))
  #:use-module ((ice-9 rdelim) #:select (read-line))
  #:use-module ((ice-9 regex) #:select (match:substring
                                        match:start
                                        match:end))
  #:use-module ((ttn-do zzz personally) #:select (FE fso)))

(define *c* (make-hash-table 99991))    ; concordance
(define *x* (make-hash-table 997))      ; exclude
(define *o* '((report-progress? . #t)   ; --quiet disables
              (min-word-length . 1)     ; set by --minimum
              (max-word-length . 59)    ; set by --maximum
              (case-insensitive . #f))) ; --case-insensitive enables

(define (set-exclude-list vals)
  (FE (string-tokenize vals (char-set-complement (char-set #\,)))
      (lambda (one-x)
        (hashq-create-handle! *x* (string->symbol one-x) #t))))

(define (opt option . newval)
  (if (null? newval)
      (assq-ref *o* option)
      (cond ((assq option *o*)
             => (lambda (cell)
                  (set-cdr! cell (car newval)))))))

(define (set-min-word-length s)
  (opt 'min-word-length (string->number s)))

(define (set-max-word-length s)
  (opt 'max-word-length (string->number s)))

(define *word-rx* (make-regexp "[-_'a-zA-Z][-_'a-zA-Z0-9]*"))
(define *from-rx* (make-regexp "^From "))

(define (parse-file file)               ; very cursorily :-/
  (let* ((minlen (opt 'min-word-length))
         (maxlen (opt 'max-word-length))
         (munge (if (opt 'case-insensitive)
                    (lambda (m)
                      (string->symbol (string-downcase (match:substring m))))
                    (lambda (m)
                      (string->symbol (match:substring m)))))
         (port (open-input-file file))
         (state #f)                     ; #f (pre From), headers, body
         (count 0))

    (define (grok-line line)
      (let loop ((start 0))
        (and=> (regexp-exec *word-rx* line start)
               (lambda (m)
                 (let* ((end (match:end m))
                        (len (- end (match:start m))))
                   (cond ((> minlen len))
                         ((< maxlen len))
                         (else
                          (let ((w (munge m)))
                            (cond ((hashq-get-handle *x* w))
                                  ((hashq-get-handle *c* w)
                                   => (lambda (h)
                                        (set-cdr! h (1+ (cdr h)))))
                                  (else
                                   (hashq-create-handle! *c* w 1))))))
                   (loop end))))))

    (define (next)
      (read-line port))

    (define (count-maybe!)
      (and state (set! count (1+ count))))

    (let loop ((line (next)))
      (cond ((eof-object? line)
             (count-maybe!))
            ((regexp-exec *from-rx* line)
             (count-maybe!)
             (set! state 'headers)
             (loop (next)))
            (else
             (and (eq? 'headers state)
                  (string-null? line)
                  (set! state 'body))
             (and (eq? 'body state)
                  (grok-line line))
             (loop (next)))))

    (and (opt 'report-progress?)
         (fso "~A: ~A message~A~%"
              file count (if (= 1 count) "" "s")))))

(define (results)
  (sort (hash-fold (lambda (key value alist)
                     (acons value key alist))
                   (list)
                   *c*)
        (lambda (a b) (> (car a) (car b)))))

(define (output sorted)
  (FE sorted (lambda (x)
               (fso "~A\t~A~%" (car x) (cdr x)))))

(define (main/qop qop)
  (and (qop 'quiet) (opt 'report-progress? #f))
  (qop 'exclude set-exclude-list)
  (qop 'minimum set-min-word-length)
  (qop 'maximum set-max-word-length)
  (and (qop 'case-insensitive) (opt 'case-insensitive #t))
  (FE (qop '()) parse-file)
  (cond ((results) => output))
  #t)

(define (main args)
  (check-hv args '((package . "ttn-do")
                   (version . "1.1")
                   ;; 1.1 -- speed up
                   ;; 1.0 -- initial
                   (help . commentary)))
  (main/qop
   (qop<-args
    args '((quiet            (single-char #\q))
           (exclude          (single-char #\x) (value #t))
           (case-insensitive (single-char #\i))
           (minimum          (single-char #\m) (value #t))
           (maximum          (single-char #\M) (value #t))))))

;;; concordance ends here