/*
 *                          Copyright (c) 2001
 *                             Megan Gentry
 *                          All Rights Reserved
 *                  Commercial Distribution Prohibited
 *
 * This software may be  freely copied  and used in its entirety for any
 * purpose  so long as the above copyright notice and these comments are
 * preserved  in  the  source  form  of  this  software, and  the binary
 * copyright is preserved in any image built from it.
 *
 * The author has used best efforts in the research, design, development
 * and  testing of  this software.  The author  makes no warranty of any
 * kind,  expressed or  implied,  with  regard to  this software and its
 * suitability for a given application.  The author shall not  be liable
 * in any  event for  incidental or  consequential damages in connection
 * with, or arising out of, the use or performance of this software. Use
 * of this software constitutes acceptance of these terms.
 *
 * The author  is committed to making a best effort at fixing any errors
 * found  in the  software and  would welcome  any reports  of problems,
 * comments  or suggestions  regarding the software.   Please send email
 * to <mbg@world.std.com>.
 */


/*
 * Abstract and Edit History
 *
 * dmextract
 *	This program was designed to read the contents of a DECmail-11
 *	.MAI file (an indexed file used on RSX and RSTS machines) and
 *	to extract the messages so that they can then be read using
 *	standard U*x mail utilities.  Input is from the specified file,
 *	while output is to standard output so that it can either be
 *	filtered or redirected to an output file.
 *
 * Edit History:
 *
 * (000) 07-Dec-2001 Megan Gentry
 *	Final coding so that the program dumps most of my message
 *	files.  The contents of the file it writes are still not
 *	properly processed by U*x mail(x) programs -- they concatenate
 *	some messages (even though all messages are written in
 *	exactly the same sequence... maybe there is a problem with
 *	header or message content).
 *
 * (001) 07-Dec-2001 Megan Gentry
 *	Okay, the 'From' line appears to need more than one field
 *	on the line for the message to be properly recognized.  A
 *	temporary workaround has been to output something else (like
 *	the date) for those 'From' lines with only one field.
 *	Now the problem is that some lines (notably those from
 *	VMS hosts) have names or witty sayings within quotes in
 *	the from field, which display improperly using U*x mail.
 *
 */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Size, in bytes, of the global scratch buffer used to hold one record */
#define BLOCK	(512)

/*
 * Copyright string compiled into the program image.
 * NOTE(review): presumably this is the "binary copyright" that the
 * license text at the top of the file requires to be preserved; it is
 * not referenced by any code visible in this file.
 */
char copyright[] = "dmextract.c, Copyright (c) 2001 by Megan Gentry";

/* Short names for the unsigned types used throughout this file */
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned long ulong;

/*
 * Debug flag, initially off.
 * NOTE(review): nothing in the visible source reads or sets it.
 */
int debug = 0;

/*
 * Message descriptor structure
 *
 * One descriptor is allocated for every message header found in the
 * mail file.  Descriptors are chained in the order the headers are
 * found.  The spos/epos members hold the file offset of the first byte
 * of the four-byte marker that opens/closes the corresponding region;
 * readers therefore seek to spos+4 to reach the first record.  A
 * freshly allocated descriptor is zero-filled, so a position of 0 or an
 * empty string means the item was never found.
 */

struct msginfo {
    struct msginfo *next;		/* Pointer to next record */
    struct msginfo *prev;		/* Pointer to previous record */
					/*   not really needed */
    ulong number;			/* Message number (counted from 1) */
    struct {
	ulong	spos;			/* File position for header start */
        ulong	epos;			/* File position for header end */
        char	from[256];		/* From record from header */
        char	date[256];		/* Date record from header */
        char	to[256];		/* To record from header */
        char	cc[256];		/* Cc record from header */
        char	subject[256];		/* Subj record from header */
	char	mid[256];		/* Undocumented field (type 0231); */
					/*   appears to duplicate the date */
    } header;
    struct {
        ulong	spos;			/* File position for text start */
        ulong	epos;			/* File position for text end */
        ulong	lines;			/* Count of non-empty text records */
    } text;
};

/*
 * Message descriptor list head structure
 *
 * Anchors the doubly linked list of message descriptors.  New
 * descriptors are appended at the tail, so walking from head follows
 * the order in which the message headers appear in the mail file.
 */

struct msghead {
    struct msginfo *head;		/* First descriptor, NULL if empty */
    struct msginfo *tail;		/* Last descriptor appended */
    ulong count;			/* Number of descriptors in list */
};

struct msghead msglist;			/* Head of message descriptor list */

/*
 * Both names are filled from the command line by main(); each array
 * holds at most 131 characters plus the terminating NUL.
 */
char cuspname[132];			/* Execution name of this program */
char mailfile[132];			/* Name of mailfile to be referenced */

/* Scratch area holding the record most recently read from the mail file */
uchar buffer[BLOCK];


/*
 * getushort
 *	Read one 16-bit unsigned value from fp.  The value is stored in
 *	the file low byte first, so it is assembled by hand instead of
 *	being read straight into a short; this keeps the result correct
 *	regardless of the byte order of the host.
 *
 *	fp	stream to read from, positioned at the low byte
 *
 *	Returns the value read.  If two bytes cannot be read (end of
 *	file or read error) 0 is returned rather than a value assembled
 *	from uninitialised memory; the stream's EOF/error indicator is
 *	left set for callers that want to tell the two cases apart.
 */
unsigned short
getushort(FILE *fp)
{
    unsigned char sbuf[2];

    if (fread(sbuf, sizeof sbuf, 1, fp) != 1)
	return 0;

    return (unsigned short)(sbuf[0] | (sbuf[1] << 8));
}


main (argc, argv)
    int argc;
    char *argv[];
{
    FILE *fopen(), *fp;
    int i, j, n;
    int reclen;
    char ch;
    uchar c1, c2, c3, c4;
    int hdrfg, hdrct;
    int txtfg, txtct, prtfg;
    ushort *wordp;
    struct msginfo *msgp;

    /* Save the cusp name */
    strcpy (cuspname, argv[0]);

    /* Assume the mail.mai file */
    strcpy (mailfile, "mail.mai");

    if (argc == 2)
	strcpy (mailfile, argv[1]);
    else if (argc > 2) {
	fprintf (stderr, "usage: %s [mail_file]\n", cuspname);
	exit (1);
    }

    /*
     * In case file was specified without ".mai" filetype, append it
     */
    if (rindex(mailfile,'.') == NULL) {
	strcat (mailfile, ".mai");
	fprintf (stderr, "warning (0): using file %s\n", mailfile);
    }

    /* open the mail file */
    if ((fp = fopen(mailfile, "rb")) == NULL) {
	fprintf (stderr, "%s: file not found %s\n", cuspname, mailfile);
	exit (1);
    }


    /*
     * In this first pass, we scan the file to find the start and end
     * of all the message headers and text blocks in the file and build
     * a linked list of messages with that information.
     */
    bzero (msglist, sizeof(struct msghead));

    hdrfg = hdrct = 0;
    txtfg = txtct = 0;

    /* Preload the character pipe-line */
    c1 = getc (fp) & 0377;
    c2 = getc (fp) & 0377;
    c3 = getc (fp) & 0377;

    do {
	/* Get the next character from the file */
	c4 = getc (fp) & 0377;

	/*
	 * See if the current four characters delineate a
	 * message header
	 */
	if ( ((c2 * 0400) + c1) == 01 && ((c4 * 0400) + c3) == 0377) {
	    if (hdrfg == 0) {
		/* We've located the start of a message header */
		hdrfg = 1;
		hdrct++;

		if (txtfg) {
		    fprintf (stderr, "warning (1): message header found while processing message %d text block\n", txtct);
		    txtfg = 0;
		}

		/* Allocate a new message descriptor */
		msgp = (struct msginfo *) malloc (sizeof(struct msginfo));
		if (!msgp) {
		    fprintf (stderr, "fatal (1): unable to allocate memory for message %d\n", hdrct);
		    exit (1);
		}

		/* Initialize descriptor */
		bzero (msgp, sizeof(struct msginfo));
		msgp->number = hdrct;
		msgp->header.spos = ftell(fp)-4;

		/* Link the new descriptor into the list */
		if (msglist.head) {
		    msgp->prev = msglist.tail;
		    (msglist.tail)->next = msgp;
		    msglist.tail = msgp;
		} else {
		    msglist.head = msgp;
		    msglist.tail = msgp;
		}

		/* Keep track of number of messages */
		msglist.count++;
	    } else {
		/* We've located the end of the message header */
		msgp->header.epos = ftell(fp)-4;
		hdrfg = 0;
	    }
	}

	/*
	 * See if the current four characters delineate a
	 * text block
	 */
	if ( ((c2 * 0400) + c1) == 01 && ((c4 * 0400) + c3) == 0204) {
	    if (txtfg == 0) {
		/* We've located the beginning of the message */
		txtfg = 1;
		txtct++;
		msgp->text.spos = ftell(fp)-4;
	    } else {
		/* We've located the end of the message */
		msgp->text.epos = ftell(fp)-4;
		txtfg = 0;
	    }
	}

	/* The pipe-line shifts */
	c1 = c2;
	c2 = c3;
	c3 = c4;
    } while (!feof(fp));

    /*
     * Verify that we have the same number of headers as text blocks
     */
    if (hdrct != txtct) {
	fprintf (stderr, "warning (1): %d headers, %d text blocks\n",
			hdrct, txtct);
    }

    fprintf (stderr, "info (1): %s appears to contain %d %s\n",
		mailfile, msglist.count,
		msglist.count == 1 ? "message" : "messages");


    /*
     * In this pass, we extract message header information to fill
     * each entry in our message descriptor list.
     */
    for (msgp = msglist.head; msgp != NULL; msgp=msgp->next) {
	fseek (fp, msgp->header.spos+4, SEEK_SET);
	while (ftell(fp) < msgp->header.epos
			&& (reclen = getushort(fp)) != 01) {
	    fread (buffer, ((reclen+1)&~1), 1, fp);
	    reclen = reclen <= 255 ? reclen : 255;
	    buffer[reclen] = 0;
	    switch ((uchar)buffer[0]) {
	      case 0231:		/* Undocumented field type */
					/* Appears to duplicate date */
		strcpy (&msgp->header.mid[0], &buffer[1]);
		if (strlen(&msgp->header.date[0]) == 0)
		    strcpy (&msgp->header.date[0], &msgp->header.mid[0]);
		break;
	      case 0201:		/* From: field */
		strcpy (&msgp->header.from[0], &buffer[1]);
		break;
	      case 0202:		/* Date: field */
		strcpy (&msgp->header.date[0], &buffer[1]);
		break;
	      case 0205:		/* To: field */
		strcpy (&msgp->header.to[0], &buffer[1]);
		break;
	      case 0206:		/* Cc: field */
		strcpy (&msgp->header.cc[0], &buffer[1]);
		break;
	      case 0207:		/* Subject: field */
		strcpy (&msgp->header.subject[0], &buffer[1]);
		break;
	    }
	}
    }


    /*
     * In this pass, we process the counted records comprising the
     * message to 1) count the number of lines of text so we can
     * report it, and 2) so that we can ensure that processing
     * continues to the end of a message block.
     */

    /*
     * NOTE:
     *	It appears that different versions of DECmail did different
     *	things with regard to the text records.  I believe they are
     *	are supposed to be counted records, two bytes count, with
     *	n bytes of text, padded with null bytes to account for
     *	odd counts.  The problem is that this is not what I found in
     *	practice in all cases.  And in some cases, the counts are
     *	fine up to some point, and then they are simply wrong.
     *	So, combining the algorithm I use for determining line length
     *	with a recovery algorithm when the next line's record count
     *	looks wrong, this seems to work best.  At the worst, I've
     *	noticed that sometimes, the final character of a line will
     *	be lost for all lines of a message, but not all lines of
     *	all messages in that same file.
     */
    for (msgp = msglist.head; msgp != NULL; msgp=msgp->next) {
	int linct = 0;
	fseek (fp, msgp->text.spos+4, SEEK_SET);
	while (ftell(fp) < msgp->text.epos) {
	    reclen = getushort(fp);
	    linct++;
	    if (reclen > 255) {
		fprintf (stderr, "warning (3): Message %04d, Line %04d, filpos 0%012lo, reclen %05d\n",
			msgp->number, linct, ftell(fp)-2, reclen);
		fprintf (stderr, "info (3): adjusting file position\n");
		fseek (fp, ftell(fp)-1, SEEK_SET);
		--linct;
		continue;
	    }
	    if (reclen == 0) continue;
	    if (reclen & 01)
		fread (buffer, reclen, 1, fp);
	    else
		fread (buffer, reclen-1, 1, fp);
	    msgp->text.lines++;
	}
	if (ftell(fp) == msgp->text.epos) continue;
	fprintf (stderr, "error (3): While processing message %d:\n",
			msgp->number);
	fprintf (stderr, "  Processing ended at file position 0%012lo\n",
			ftell(fp));
	fprintf (stderr, "   Text block ends at file position 0%012lo\n",
			msgp->text.epos);
    }


    /*
     * In this pass, we finally start outputting the information
     * obtained from the headers, followed by the associated
     * message text.  Output is done in such a way that it can
     * be read by U*x standard mail utilities (hopefully).
     */
    for (msgp = msglist.head; msgp != NULL; msgp=msgp->next) {

	/*
	 * Output the 'From' field.  If there is only one
	 * field in the text, add the date as a second field
	 * on the line so that U*x mail utilities can properly
	 * identify starts of messages
	 */
	if (index(msgp->header.from,' '))
	    printf ("From %s\n", &msgp->header.from[0]);
	else
	    printf ("From %s %s\n",
			&msgp->header.from[0],
			&msgp->header.date[0]);

	printf ("From: %s\n", &msgp->header.from[0]);

	/* There should always be a recipient specified */
	printf ("To: %s\n", &msgp->header.to[0]);

	/* There doesn't always have to be a CC list */
	if (strlen(&msgp->header.cc[0]))
	    printf ("Cc: %s\n", &msgp->header.cc[0]);

	/* There always has to be a date */
	printf ("Date: %s\n", &msgp->header.date[0]);

	/* There doesn't always have to be a subject */
	if (strlen(&msgp->header.subject[0]))
	    printf ("Subject: %s\n", &msgp->header.subject[0]);

	/* Let's assume the mail files were read at some point */
	printf ("Status: RO\n");

	/* This is dmextract-specific info for debugging */
	printf ("DECmail-Info: %s\n", &msgp->header.mid[0]);
	printf ("DMextract-Number: %d\n", msgp->number);
	printf ("DMline-Count: %d\n", msgp->text.lines);

	/* Separate header from text with a blank line */
	printf ("\n");

	/* Now we start outputting the message text */
	fseek (fp, msgp->text.spos+4, SEEK_SET);
	while (ftell(fp) < msgp->text.epos) {
	    reclen = getushort(fp);
	    if (reclen > 255) {
		fseek (fp, ftell(fp)-1, SEEK_SET);
		continue;
	    }
	    if (reclen == 0) continue;
	    if (reclen & 01)
		fread (buffer, reclen, 1, fp);
	    else
		fread (buffer, reclen-1, 1, fp);
	    buffer[reclen] = 0;
	    if (strncmp(&buffer[0],"From",4) == 0)
		printf (">");
	    for (i = 0; i < reclen-1; i++) {
		if (buffer[i] == 012) break;
		if (buffer[i] == 015) {
		    (void) getc (fp);
		    continue;
		}
		printf ("%c", buffer[i]);
	    }
	    printf ("\n");
	}
	printf ("\n");
    }
}