#!/bin/sh
exec ${GUILE-guile} -e '(ttn-do gutenberg2mbox)' -s $0 "$@" # -*-scheme-*-
!#
;;; gutenberg2mbox

;; Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2009, 2010 Thien-Thi Nguyen
;;
;; This file is part of ttn-do, released under the terms of the
;; GNU General Public License as published by the Free Software
;; Foundation; either version 3, or (at your option) any later
;; version.  There is NO WARRANTY.  See file COPYING for details.

;;; Commentary:

;; Usage: gutenberg2mbox [options] GUTENBERG-FILE
;;
;; Convert GUTENBERG-FILE to an mbox file, writing GUTENBERG-FILE.mbox.
;; Lines up to and including the one that matches a preamble-end regexp,
;; as well as those starting with the one that matches a postamble-begin
;; regexp, are discarded.
;;
;; Each chapter becomes a message.  The first message is the title page.
;; The title page is taken from the lines between the preamble and the
;; first line that matches a chapter-begin regexp.
;;
;; The title is taken from the first non-empty line on the title page.
;; The subject of each message is made from the first two lines of the
;; chapter, in this format: LINE-1 -- LINE-2.
;;
;;  -v, --verbose            -- display message subject lines
;;  -o, --output FILENAME    -- write output to FILENAME
;;      --preamble-end RX    -- use RX as the preamble-end regexp
;;                              [default: "^.END.THE SMALL PRINT"]
;;      --chapter-begin RX   -- use RX as the chapter-begin regexp
;;                              [default: "^Chapter "]
;;      --subject-skip N     -- skip N presumably-blank lines to find
;;                              LINE-1 and LINE-2 [default: 0]
;;      --postamble-begin RX -- use RX as the postamble-begin regexp
;;
;; The postamble-begin regexp is optional; there is no default for it.
;; If there is an error, the output file is not written.

;;; Code:

(define-module (ttn-do gutenberg2mbox)
  #:export (main)
  #:use-module ((ttn-do zzz banalities) #:select (check-hv
                                                  qop<-args))
  #:use-module ((ice-9 rdelim) #:select (read-line))
  #:use-module ((srfi srfi-13) #:select (string-trim-both))
  #:use-module ((srfi srfi-14) #:select (char-set))
  #:use-module ((ttn-do zzz personally) #:select (accumulator
                                                  FE fs make-fso)))

(define eol (char-set #\cr #\nl))

(define (convert qop)
  (let* ((file (car (qop '())))
         (port (open-input-file file))
         (next (lambda () (let ((v (read-line port)))
                            (and (not (eof-object? v))
                                 (string-trim-both v eol)))))
         (chrx (make-regexp (or (qop 'chapter-begin)
                                "^Chapter ")))
         (backup-line #f)               ; ugh
         (title #f)
         (title-page (accumulator))
         (headers #f))
    ;; ignore preamble
    (let ((rx (make-regexp (or (qop 'preamble-end)
                               "^.END.THE SMALL PRINT"))))
      (let loop ((line (next)))
        (or line (error "Could not find end of preamble"))
        (or (regexp-exec rx line)
            (loop (next)))))
    ;; compute title and title-page; stash header, too
    (let loop ((line (next)))
      (cond ((regexp-exec chrx line)
             (set! backup-line line)
             (set! headers (fs "From ~A ~A~%From: ~A"
                               file (strftime "%c" (localtime (stat:mtime
                                                               (stat file))))
                               (or title "(untitled)"))))
            (else
             (or title (string-null? line)
                 (set! title (string-trim-both line)))
             (title-page line)
             (loop (next)))))
    ;; snarf chapters
    (let* ((lines #f)
           (spew (if (qop 'verbose) display identity))
           (subj-skip (or (qop 'subject-skip string->number) 0))
           (postrx (qop 'postamble-begin make-regexp))
           (fo (make-fso (open-output-file
                          (or (qop 'output)
                              (fs "~A.mbox" file))))))
      (define (new-chapter! init)
        (set! lines (accumulator))
        (lines init))
      (define (out! chapter-lines)
        (fo "~A~%" headers)
        (let* ((rest (list-tail (cdr chapter-lines) subj-skip))
               (subj (fs "Subject: ~A -- ~A~%"
                         (car chapter-lines)
                         (car rest))))
          (spew subj)
          (fo "~A~%" subj)
          (FE (cdr rest)
              (lambda (line)
                (fo "~A~%" line)))))
      (out! (append! (cons "title-page" (make-list (1+ subj-skip) ""))
                     (title-page)))
      (new-chapter! backup-line)
      (let loop ((line (next)))
        (cond ((or (not line) (and postrx (regexp-exec postrx line)))
               (out! (lines)))
              ((regexp-exec chrx line)
               (out! (lines))
               (new-chapter! line)
               (loop (next)))
              (else
               (lines line)
               (loop (next))))))))

(define (main args)
  (check-hv args '((package . "ttn-do")
                   (version . "2.0")
                   ;; 2.0 -- w/ options, only one file
                   ;; 1.0 -- no options, multiple files
                   (help . commentary)))
  (convert
   (qop<-args
    args '((verbose (single-char #\v))
           (preamble-end (value #t))
           (chapter-begin (value #t))
           (postamble-begin (value #t))
           (subject-skip (value #t))
           (output (single-char #\o) (value #t))))))

;;; gutenberg2mbox ends here