#! /usr/bin/env perl
# burst.pl -- Burst a specially formatted text file into separate files.
# Kirk Lougheed, 5/16/2001

# $Id: burst.pl,v 1.14 2005/12/18 04:43:13 kirk Exp kirk $

#
# Set one of the following variables to 1.
# This tells burst.pl what sort of file you are expanding.
#
$netscape = 1;          # burst Netscape/Mozilla/Thunderbird mail files
$outlook = 0;           # burst Outlook batch files
$oexpress = 0;          # Rename Outlook Express files 
$agent = 0;             # burst Forte Agent batch files
$pegasus = 0;           # burst Pegasus batch files
$eudora = 0;            # burst Eudora mailbox file

#
# Normally burst.pl prints a log to the screen.
# On a Windows system, this log vanishes when burst.pl completes,
# unless you are running burst.pl in a DOS window.  If you want
# burst.pl to leave behind a log file, set $logfile to 1.
# You may also change the default filename from "errors.txt" to
# something more to your liking.
#
$logfile = 0;
$logfilename = "errors.txt";

#
# Part of the log is a set of personal names and initials in
# the format used in the names.txt file.  We normally generate
# this, except for Agent files, which don't have personal names.
#
$namesflag = 1;
@namelist = ();
if ($agent) {
    $namesflag = 0;
}

#
# If we're writing to a logfile, set it up.
#
if ($logfile) {
    close(STDERR);
    open(STDERR, ">".$logfilename);
}

#
# Get input files.  Wildcards are okay.
#
@inputfiles = get_ttyinput("Input File: ", "");
@inputfiles = <@inputfiles>;

#
# Get a subdirectory of our current directory.
# Create it if it doesn't already exist.
#
$outdir = get_ttyinput("Output Directory: ", "");
if (not -e $outdir) {
    mkdir($outdir,0777) || die "Can't create $outdir: $!\n";
}

#
# Loop over all files.
# In "Agent mode" a new contribution starts with "From user@domain ".
# In "Netscape mode" a new contribution starts with "From - ".
# In "Pegasus mode" a new contribution ends with "-- End --".
# In "Outloook mode" a new contribution starts with "From:\t".
# In "Eudora mode" a new contribution starts with "From ???@??? ".
# In "Outlook Express mode" each file is a new contribution.
#
# Figure out the userid and possible initials.
# Write the contribution into a file that has initials, userid, or "somebody"
# as the name.  If we don't come up with initials the first time, assume
# that stuff like "PR 1 KSL " has turned into "PR1KSL" or "PR1 KSL".
# Files we have fixed up, or could not find initials for, have names
# that start with a "+" to make them more obvious.
#
foreach $file (@inputfiles) {
    $mode = get_mode();
    print STDERR "\nProcessing $file assuming $mode format\n";
    $text = slurp_file($file);
    $text =~ s/\s+\Z/\n/s;     # only a single \n ends the input file
    while (length($text)) {
        #
        # We will clear $trash if we find a valid piece of email.
        #
        $trash = 1;

        #
        # A Eudora-style mail file
        #
        if ($eudora) {
            if ($text =~ m/(From \?\?\?@\?\?\? .+?){1}(From \?\?\?@\?\?\? |\Z)/sm) {
                $trash = 0;
                $email = $1;
                $text = $2.$';
                $userid = "somebody";
                remove_encoding(\$email);
                if ($email =~ m/^From:.+?\b([A-Za-z0-9\.-_]+?)@/sm) {
                   $userid = $1;
                }
                $email =~ m/^From:\s+(.*?)$/sm;
                $nameline = $1;
                $nameline =~ s/\"//g;
            }
        }

        #
        # A Netscape-style mail file
        #
        if ($netscape) {
            if ($text =~ m/(^From - .+?\n){1}(^From - |\Z)/sm) {
                $trash = 0;
                $email = $1;
                $text = $2.$';
                $userid = "somebody";
                remove_encoding(\$email);
                if ($email =~ m/^From:.+?\b([A-Za-z0-9\.-_]+?)@/sm) {
                   $userid = $1;
                }
                $email =~ m/^From:\s+(.*?)$/sm;
                $nameline = $1;
                $nameline =~ s/\"//g;
            }
        }
                
        #
        # An Outlook Express -style mail file
        # We are passing the entire file through to extract information
        # and it give it a better name.
        #
        if ($oexpress) {
             $trash = 0;
             $userid = "somebody";
             if ($text =~ m/^From:.+?\b([A-Za-z0-9\.-_]+?)@/sm) {
                $userid = $1;
             }
             $email = $text;
             remove_encoding(\$email);
             $email =~ m/^From:\s+(.*?)$/sm;
             $nameline = $1;
             $nameline =~ s/\"//g;
             $text = "";
        }
                
        #
        # An Outlook-style saved mail file
        # Note that "From:\t" need not start on a new line
        # It also has [] around the email IDs for some reason.
        #
        if ($outlook) {
            if ($text =~ m/(From:\s+.+?){1}(From:\s+|\Z)/s) {
                $trash = 0;
                $email = $1;
                $text = $2.$';
                $userid = "somebody";
                remove_encoding(\$email);
                if ($email =~ m/From:.+?\b([A-Za-z0-9\.-_]+?)@/sm) {
                   $userid = $1;
                }
                $email =~ m/From:\s+(.*?)$/sm;
                $nameline = $1;
                $nameline =~ s/\[/</g;
                $nameline =~ s/\]/>/g;
            }
        }

        #
        # An Agent batch-save file
        # Agent files do not have personal name information.
        #
        if ($agent) {
            if ($text =~ m/(^From (.+?)@(.+?)\n){1}(^From |\Z)/sm) {
                $trash = 0;
                $email = $1;
                $userid = $2;
                $text = $4.$';
                remove_encoding(\$email);
            }
        }

        #
        # A Pegasus batch-save file
        #
        if ($pegasus) {
            if ($text =~ m/(\A.+?\n-- End --\n)/sm) {
                $trash = 0;
                $email = $1;
                $text = $';
                $userid = "somebody";
                remove_encoding(\$email);
                if ($email =~ m/^From:.+?\b([A-Za-z0-9\.-_]+?)@/sm) {
                   $userid = $1;
                }
                $email =~ m/^From:\s+(.*?)$/sm;
                $nameline = $1;
                $nameline =~ s/\"//g;
            }
        }

        #
        # If the remaining text is just a single end-of-line, 
        # quietly null it out.  \Z matches just before last newline.
        #
        if ($text eq "\n") {
            $text = "";
        }

        #
        # If $trash is set, we have a formatting problem.
        # Write the remainder of our text to the file "somebody.txt"
        # Clear $text so we finish with this file.
        #
        if ($trash) {
            $email = $text;
            $userid = "somebody";
            $text = "";     
        }

        #
        # Remove a single level for forwarding comment.
        # Doesn't touch a line starting with ">>", our indentation token.
        #
        $email =~ s/^>([^>])/$1/mg;

        #
        # Determine initials.
        # If that doesn't work, try various fixups on a copy and look again.
        #
        if ($email =~ m/^\s*(PR|SA|TR|GM)\s+\d{1,2}\.?\s+([A-Za-z]{2,3})\s+/m) {
            $initials = $2;
        } else {
            $initials = "+$userid";
            for (;;) {

                # Handle "PR1KL", "PR1KL:",  "PR1 KL", "PR1 KL:", "PR 1 KL:"
                # Ignore periods and commas after the numbers.
                #
                $munge = $email;
                $munge =~ s/^\s*(PR|SA|TR|GM)\s*(\d{1,2})\.?,?\s*([A-Za-z]{2,3}):?/\n\U$1\E $2 \U$3\E /mg;
                if ($munge =~ m/^\s*(PR|SA|TR|GM)\s+\d{1,2}\s+([A-Za-z]{2,3})\s+/m) {
                    $initials = "+$2";
                    $email = $munge;
                    last;
                }

                # Handle swapped "PR KL 1", possibly ending in a colon, period, or comma

                $munge = $email;
                $munge =~ s/^\s*(PR|SA|TR|GM)\s+([A-Za-z]{2,3})\s+(\d{1,2})[\.:,]?/\n\U$1\E $3 \U$2\E /mg;
                if ($munge =~ m/^\s*(PR|SA|TR|GM)\s+\d{1,2}\s+([A-Za-z]{2,3})\s+/m) {
                    $initials = "+$2";
                    $email = $munge;
                    last;
                }

                # Insert another fixup here when the newbies get more creative

                last;
            }
        }
        $outfile = get_filename($outdir, $initials);
        print STDERR "  -> $userid ($initials) -> $outfile\n";
        if ($namesflag) {
            $initials .= " " if (length($initials) == 2);
            push(@namelist, "person:$initials  $nameline\n");
        }
        open(OUTPUT, ">$outfile");
        print OUTPUT $email;
        close(OUTPUT);
        if ($^O eq 'MacOS') {
            &MacPerl'SetFileInfo("ttxt", "TEXT", $outfile);  # SimpleText
        }
    }

    #
    # Write out the sorted names.txt material if we have it.
    #
    if ($namesflag) {
        print STDERR "\n\n";
        foreach (sort @namelist) {
            print STDERR $_;
        }
    }
}

#
# Close our logfile, if any.
#
if ($logfile) {
    close(STDERR);
    unlink($logfilename) if (-z $logfilename);
}

exit;

#
# get_filename
# Given a subdirectory and a user ID, create a new filespec.
# Make sure we don't overwrite any existing file.
#
# Can't rely on having the File:: package: handcraft the filenames
# so this works on MacOS, Unix, and Windows.
#
sub get_filename {
    my ($subdir, $username) = @_;
    my ($name, $tag, $count, $leadin, $slash);
    
    $username = lc $username;
    $username =~ s/\./_/g;         # Windows can't handle too many dots.
    $tag = "";
    $count = 0;
    $leadin = ($^O eq 'MacOS') ? ":" : "";
    $slash = ($^O eq 'MacOS') ? ":" : "/";
    do {
        $name = "$leadin$subdir$slash$username$tag.txt";
        $count++;
        $tag = "-$count";
    } until (not -e $name);
    return($name);
}

#
# slurp_file($filename)
# Open file, read contents into $text, close file, and return the string
# 
sub slurp_file {
    my ($file) = @_;
    my $text;
    open(INFILE, $file) or die "Can't open $file for input: $!\n";
    select(INFILE);
    undef $/;             # no input line terminator
    $text = <INFILE>;     # read everything at once
    $text =~ s/\r\n?/\n/g;  # turn CR (Mac) or CRLF into LF (Unix/Windows)
    close(INFILE);
    $/ = "\n";            # be nice to other callers
    return($text);
}

#
# get_ttyinput
# Given a prompt and a default, read from STDIN and return a string
#
sub get_ttyinput {
    my ($prompt, $default) = @_;
    my ($line);
    print STDOUT $prompt;
    chop($line = <STDIN>);
    if ($line eq "") {
        $line = $default;
    }
    return($line);
}

#
# get_mode
# Return string describing our mode.
#
sub get_mode {
    if ($eudora) { return("Eudora"); }
    if ($netscape) { return("Netscape/Mozilla/Thunderbird"); }
    if ($outlook)  { return("Outlook"); }
    if ($oexpress) { return("Outlook Express"); }
    if ($agent)    { return("Forte Agent"); }
    if ($pegasus)  { return("Pegasus"); }
    return("Unknown");
}


#
# remove_encoding
# Remove base64 encoding if it is present.
# We save the header, then remove any html crud.  If either the data
# section or the header still mentions base64, decode the data section.
#
sub remove_encoding {
    my ($data) = @_;
    return if (not $$data =~ m/base64/ig);
    $$data =~ s/(\A.+?\n\n)//s;
    $header = $1;
    $$data =~ s/<HTML.*\Z//is;
    $$data =~ s/<!DOCTYPE HTML.*\Z//is;
    if ($header =~ m/base64/g || $$data =~ m/base64/g) {
        base64_decode(\$$data);
    }
    $$data = $header."\n".$$data
}


#
# base64_decode
# Code a  base64 encoded MIME section.
#
sub base64_decode {
    my ($data) = @_;
    @rawarray = split(/\n/, $$data, -1);
    @encarray = ();
    foreach (@rawarray) {
        next if (m|[^A-Za-z0-9+=/]| or $_ eq "");
        push(@encarray, $_);
    }
    $str = join '', @encarray;
    if (length($str) % 4) {
#        print STDERR "Length of base64 data not a multiple of four";
        return;
    }
    $str =~ s/=+$//;                        # remove padding
    $str =~ tr|A-Za-z0-9+/| -_|;            # convert to uuencoded format
    $$data = join'', map(unpack("u", chr(32 + length($_)*3/4) . $_),
                         $str =~ /(.{1,60})/gs);
    $$data =~ s/\r\n?/\n/g;  # turn CR (Mac) or CRLF into LF (Unix/Windows)

}

# end of burst.pl

