#!/bin/sh
exec ${GUILE-guile} -e '(ttn-do concordance)' -s $0 "$@" # -*-scheme-*-
!#
(define-module (ttn-do concordance)
#:export (main)
#:use-module ((ttn-do zzz banalities) #:select (check-hv
qop<-args))
#:use-module ((srfi srfi-13) #:select (string-tokenize))
#:use-module ((srfi srfi-14) #:select (char-set-complement
char-set))
#:use-module ((ice-9 rdelim) #:select (read-line))
#:use-module ((ice-9 regex) #:select (match:substring
match:start
match:end))
#:use-module ((ttn-do zzz personally) #:select (FE fso)))
(define *c* (make-hash-table 99991)) (define *x* (make-hash-table 997)) (define *o* '((report-progress? . #t) (min-word-length . 1) (max-word-length . 59) (case-insensitive . #f)))
(define (set-exclude-list vals)
(FE (string-tokenize vals (char-set-complement (char-set #\,)))
(lambda (one-x)
(hashq-create-handle! *x* (string->symbol one-x) #t))))
(define (opt option . newval)
(if (null? newval)
(assq-ref *o* option)
(cond ((assq option *o*)
=> (lambda (cell)
(set-cdr! cell (car newval)))))))
(define (set-min-word-length s)
(opt 'min-word-length (string->number s)))
(define (set-max-word-length s)
(opt 'max-word-length (string->number s)))
(define *word-rx* (make-regexp "[-_'a-zA-Z][-_'a-zA-Z0-9]*"))
(define *from-rx* (make-regexp "^From "))
(define (parse-file file) (let* ((minlen (opt 'min-word-length))
(maxlen (opt 'max-word-length))
(munge (if (opt 'case-insensitive)
(lambda (m)
(string->symbol (string-downcase (match:substring m))))
(lambda (m)
(string->symbol (match:substring m)))))
(port (open-input-file file))
(state #f) (count 0))
(define (grok-line line)
(let loop ((start 0))
(and=> (regexp-exec *word-rx* line start)
(lambda (m)
(let* ((end (match:end m))
(len (- end (match:start m))))
(cond ((> minlen len))
((< maxlen len))
(else
(let ((w (munge m)))
(cond ((hashq-get-handle *x* w))
((hashq-get-handle *c* w)
=> (lambda (h)
(set-cdr! h (1+ (cdr h)))))
(else
(hashq-create-handle! *c* w 1))))))
(loop end))))))
(define (next)
(read-line port))
(define (count-maybe!)
(and state (set! count (1+ count))))
(let loop ((line (next)))
(cond ((eof-object? line)
(count-maybe!))
((regexp-exec *from-rx* line)
(count-maybe!)
(set! state 'headers)
(loop (next)))
(else
(and (eq? 'headers state)
(string-null? line)
(set! state 'body))
(and (eq? 'body state)
(grok-line line))
(loop (next)))))
(and (opt 'report-progress?)
(fso "~A: ~A message~A~%"
file count (if (= 1 count) "" "s")))))
(define (results)
(sort (hash-fold (lambda (key value alist)
(acons value key alist))
(list)
*c*)
(lambda (a b) (> (car a) (car b)))))
(define (output sorted)
(FE sorted (lambda (x)
(fso "~A\t~A~%" (car x) (cdr x)))))
(define (main/qop qop)
(and (qop 'quiet) (opt 'report-progress? #f))
(qop 'exclude set-exclude-list)
(qop 'minimum set-min-word-length)
(qop 'maximum set-max-word-length)
(and (qop 'case-insensitive) (opt 'case-insensitive #t))
(FE (qop '()) parse-file)
(cond ((results) => output))
#t)
(define (main args)
(check-hv args '((package . "ttn-do")
(version . "1.1")
(help . commentary)))
(main/qop
(qop<-args
args '((quiet (single-char #\q))
(exclude (single-char #\x) (value #t))
(case-insensitive (single-char #\i))
(minimum (single-char #\m) (value #t))
(maximum (single-char #\M) (value #t))))))