ulfben

Pidgin2Mail

Feb 3rd, 2014
139
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #Pidgin2Mail parses Pidgin HTML chatlogs, and uploads them to an email server.
  2. #Pidgin creates a new log file every time you open a chat window, so many short logs are created if you often close your chat windows.
  3. #This script will merge all files per day, so each contact generates 1 email/day, instead of 1 email per HTML log file.
  4.  
  5. #There's 6 classes in this document, sorry. Search for "CONFIG" and adjust as needed.
  6. #It boils down to: a path to output a  log-file. A path to your chat-logs, your user account info for your mail, and you mail server's adress
  7.  
  8. #Before running it, take a look at your Pidgin log-files. If the content of the <title>-tag doesn't match
  9. #mine (say, you're running Pidgin in a different language), adjust these in the script too.
  10.  
  11. # //Ulf Benjaminsson, ulfben.com, ulfben at gmail dot com
  12.  
  13.  
  14. package Utils{
  15.     use strict; use warnings;
  16.     sub Utils::getStringBetween{
  17.         #($title) = $str =~ m/ '<title>' (.*) '</title>' /;
  18.         my $str = $_[0];
  19.         my $start = $_[1];
  20.         my $end = $_[2];
  21.         my $startpos = length($start) + index($str, $start);
  22.         my $endpos = index($str, $end, $startpos) - length($str); #rindex($str, $end)
  23.         my $result = substr($str, $startpos, $endpos) or warn Utils::logit("\t\t\tINFO: Error in getStringBetween:\n\t\tStart: '$start'\n\t\tEnd: '$end'\n\t '$str'");
  24.         return $result;
  25.     }
  26.  
  27.     my $logdir='D:\Dropbox\Applications\PidginPortable\Data\settings\.purple\Pidgin2Mail\\'; #CONFIG
  28.     sub Utils::logit
  29.     {
  30.         my $s = shift;
  31.         my $eol = shift;
  32.         $eol = $eol ? $eol : "\n";
  33.         my ($logsec,$logmin,$loghour,$logmday,$logmon,$logyear,$logwday,$logyday,$logisdst)=localtime(time);
  34.         my $logtimestamp = sprintf("%4d-%02d-%02d %02d:%02d:%02d",$logyear+1900,$logmon+1,$logmday,$loghour,$logmin,$logsec);      
  35.         my $logfile = sprintf("%s%4d-%02d-%02d.%s",$logdir,$logyear+1900,$logmon+1,$logmday,'txt');            
  36.         my $fh;
  37.         open($fh, '>>', "$logfile") or warn "$logfile: $!";
  38.             print $fh "$logtimestamp\t $s\n";          
  39.         close($fh);
  40.         print "$logtimestamp\t $s $eol";
  41.         return "$logtimestamp\t $s $eol";
  42.     }
  43. }
  44.  
  45. #Holds all instances of PidginLogs, sorts them and, optionally, merges them into daily batches.
  46. package LogContainer{
  47.     use strict; use warnings;
  48.     my %_logs = ();
  49.     my @_hashes = ();
  50.    
  51.     sub new{
  52.         my ($class, %attrs) = @_;      
  53.         bless \%attrs, $class;
  54.     }
  55.    
  56.     sub addLog{
  57.         my ($self, $pidginlog) = @_;       
  58.         my $type = $pidginlog->{chat_protocol};
  59.         my $receiver = $pidginlog->{receiver};
  60.         my $sender = $pidginlog->{sender};     
  61.         my $ymd = $pidginlog->{dateTime}->ymd('-'); # yyyy-mm-dd
  62.         my $name = $pidginlog->{name}; # 2007-08-28.130702+0200CEST assumed unique
  63.         my $bucket = "$type $receiver $sender $ymd"; #assuming whitespace will never appear in either of these.
  64.         if(!$_logs{$bucket}){
  65.             $_logs{$bucket} = ();          
  66.             push(@_hashes, $bucket);           
  67.         }
  68.         if($_logs{$bucket}{$name}){
  69.             die Utils::logit("INFO: $name is not unique, at LogContainer::addLog");
  70.         }
  71.         $_logs{$bucket}{$name} = $pidginlog;       
  72.     }
  73.    
  74.     sub merge{
  75.         my $self = shift;
  76.         my $count = $self->getCount();
  77.         Utils::logit("INFO: Starting merge with $count files in ".@_hashes." buckets.");
  78.         @_hashes = sort { lc($a) cmp lc($b) } @_hashes; #sorting hashes, just to keep the logs readable.
  79.         foreach my $bucket(@_hashes){
  80.             my %logs = %{$_logs{$bucket}};         
  81.             my @keys = $self->getSortedKeys($bucket) or next;                      
  82.             if(@keys < 2){ next; }         
  83.             Utils::logit(@keys." logs in '$bucket' before merge.");            
  84.             my $first = $logs{shift @keys};                    
  85.             foreach my $logname(@keys){            
  86.                 $first->appendLogContent($logs{$logname});             
  87.                 delete $_logs{$bucket}{$logname};                          
  88.             }          
  89.         }  
  90.         $count = $self->getCount();    
  91.         Utils::logit("INFO: Merge completed. There's now $count logs in ".@_hashes." buckets.");
  92.     }
  93.    
  94.     sub getSortedKeys{
  95.         my ($self, $bucket) = @_;                  
  96.         return sort { lc($a) cmp lc($b) } keys %{$_logs{$bucket}};             
  97.     }
  98.    
  99.     sub get{
  100.         my ($self, $bucket) = @_;
  101.         my @result = ();
  102.         my @keys = $self->getSortedKeys($bucket) or Utils::logit("INFO: Invalid bucket '$bucket' in ChatLogs::get");
  103.         foreach my $logname(@keys){            
  104.             Utils::logit("INFO: Invalid logname '$logname' in ChatLogs::get ") unless $_logs{$bucket}{$logname};
  105.             push(@result, $_logs{$bucket}{$logname});          
  106.         }      
  107.         return @result;
  108.     }
  109.        
  110.     sub getBuckets{    
  111.         return @_hashes;
  112.     }
  113.  
  114.     sub getCount{
  115.         my $count = 0;
  116.         foreach my $bucket(@_hashes){
  117.             $count += keys %{$_logs{$bucket}};
  118.         }
  119.         return $count;
  120.     }
  121. }
  122.  
  123. package PidginLog{
  124.     use strict; use warnings;
  125.     use DateTime::Format::Mail;
  126.     use DateTime::Format::RFC3501;
  127.     use feature qw(switch);
  128.     #   MAYBE CONFIG
  129.     #       if your Pidgin logs in another language, make sure these match the <title>-tag of your chat logs.
  130.     my $_sep1 = 'Conversation with ';
  131.     my $_sep2 = ' at ';
  132.     my $_sep3 = ' on ';
  133.     my $_sep4 = ' (';
  134.     my $_sep5 = ')';
  135.     #   CONFIG
  136.    
  137.     my $name;               #'2007-08-28.121917+0200CEST'
  138.     my $UID;                #an identifier of each file, to ensure we don't parse or submit stuff we've already done.
  139.     my @UIDs;               #identifiers of files that were merged with this log.
  140.     my $rawContent;         #full file content
  141.     my $subject;            #'Conversation with x@x.com at 2007-08-28 12:19:17 on y@y.com (msn)'
  142.     my $body;               #all html between <body>-tags
  143.     my $sender;             #'x@x.com'
  144.     my $timestamp;          #timestamp from the log text. Inconsistent, and thus - unused.
  145.     my $receiver;           #'y@y.com'
  146.     my $chat_protocol;      #'msn'
  147.     my $dateTime;           # DateTime-object. '%Y-%m-%d.%H%M%S%z'  2007-08-28T12:19:17
  148.     my $headertime;         #'Tue, 28 Aug 2007 12:19:17 +0200'
  149.     my $inboxtime;          #'28-Aug-2007 12:19:17 +0200'
  150.    
  151.     sub new {
  152.         my ($class) = shift;
  153.         my $self  = { @_ };    
  154.         return undef unless defined $self->{name} and defined $self->{rawContent} and defined $self->{dateTime} and defined $self->{UID};                  
  155.         return undef unless (index($self->{rawContent}, '<title>') >= 0) && (index($self->{rawContent}, '</title>') >= 0); #if the file holds bad content. This happens.
  156.         $self->{headertime}     = DateTime::Format::Mail->format_datetime($self->{dateTime}); #RFC2822
  157.         $self->{inboxtime}      = DateTime::Format::RFC3501->format_datetime($self->{dateTime});
  158.         $self->{subject}        = Utils::getStringBetween($self->{rawContent},'<title>','</title>');       
  159.         $self->{sender}         = Utils::getStringBetween($self->{subject}, $_sep1, $_sep2);
  160.         #$self->{timestamp}     = Utils::getStringBetween($title, $_sep2, $_sep3); #unused, since it's inconsistent
  161.         $self->{receiver}       = Utils::getStringBetween($self->{subject}, $_sep3, $_sep4);
  162.         $self->{chat_protocol}  = Utils::getStringBetween($self->{subject}, $_sep4, $_sep5);
  163.         $self->{body}           = substr($self->{rawContent}, index($self->{rawContent}, '<body>')); #slurp up everything else.
  164.         my $find = quotemeta('</body>'); #cause sometimes Pidgin forgot to add the closing body tag.
  165.         $self->{body} =~ s/$find//g; #let's remove it if it's there.       
  166.         $self->{UIDs}           = [$self->{UID}];
  167.         delete $self->{rawContent};            
  168.         return bless $self, $class;
  169.     }
  170.        
  171.     sub getMailHeaders{
  172.         my $self = shift;
  173.         my $headers = "From: $self->{sender}\r\n";
  174.         $headers .= "To: $self->{receiver}\r\n";   
  175.         $headers .= "Subject: $self->{subject}\r\n";
  176.         $headers .= "Date: $self->{headertime}\r\n";
  177.         $headers .= "MIME-Version: 1.0\r\n";
  178.         $headers .= "Content-Type: text/html; charset=UTF-8\r\n"; #"Content-Type: text/html; charset=iso-8859-1\r\n";
  179.         return $headers;
  180.     }
  181.    
  182.     sub getHTMLBody{
  183.         my $self = shift;
  184.         return '<body>'.$self->{body}.'</body>';
  185.     }
  186.  
  187.     sub getUIDs{
  188.         my $self = shift;
  189.         return $self->{UIDs};
  190.     }
  191.     sub appendLogContent{
  192.         my ($self, $pidginlog) = @_;   
  193.         die Utils::logit("\tINFO: Attempted merge of logs from different senders")  unless $self->{sender} eq $pidginlog->{sender};
  194.         die Utils::logit("\tINFO: Attempted merge of logs to different accounts")   unless $self->{receiver} eq $pidginlog->{receiver};
  195.         die Utils::logit("\tINFO: Attempted merge of logs from different networks") unless $self->{chat_protocol} eq $pidginlog->{chat_protocol};
  196.         die Utils::logit("\tINFO: Attempted unsorted merge of logs")                unless (DateTime->compare($pidginlog->{dateTime}, $self->{dateTime}) > 0);     
  197.         Utils::logit("\tAppending: $pidginlog->{name}");
  198.         $self->{body} .= "\n\n".$pidginlog->{body};        
  199.         push($self->{UIDs}, $pidginlog->{UID});
  200.     }
  201. }
  202.  
  203. package IMAPHelper{
  204.     use strict; use warnings;
  205.     use Net::SSLeay;
  206.     use Mail::IMAPClient;  
  207.     use IO::Socket::SSL;
  208.     use DateTime::Format::Strptime;
  209.  
  210.     ##CONFIG       
  211.     my $_markAsRead = 1; #mark all synced logs as read in inbox.   
  212.     ##CONFIG
  213.  
  214.     ##MEMBERS
  215.     my $_imap = undef;
  216.     my $_sep = '';          # folder hierarchy separator character 
  217.     my $_baseFolder = '';  
  218.     my $_currentFolder = '';
  219.     my $_port = '';
  220.     my $_server = '';  
  221.    
  222.     sub new{
  223.         my ($class, %attrs) = @_;      
  224.         $_port = $attrs{'port'};
  225.         $_server = $attrs{'server'};
  226.         bless \%attrs, $class;
  227.     }
  228.     sub DESTROY {
  229.         my $self = shift;
  230.         if($_imap){
  231.             $self->disconnect();
  232.         }
  233.     }
  234.     sub connect{
  235.         my ($self, $user, $password, $basefolder) = @_;
  236.         if($_imap){
  237.             die "\n\tError: connecting twice on same socket";
  238.         }
  239.         Utils::logit("Logging in '$user'.");
  240.         my $socket = IO::Socket::SSL->new(  
  241.            PeerAddr =>  $_server,  
  242.            PeerPort =>  $_port,
  243.            SSL_verify_mode => 'SSL_VERIFY_NONE'
  244.         )  
  245.         or die "socket(): $@";  
  246.         $_imap = Mail::IMAPClient->new(
  247.             User     => $user,
  248.             Password => $password,
  249.             socket => $socket,
  250.             Uid => 1,
  251.         ) or die ($_imap);
  252.         $_imap->IsAuthenticated() or die Utils::logit("\tINFO: Couldn't Authenticate!");
  253.         $_sep = $_imap->separator;          # Get folder hierarchy separator character
  254.         $_baseFolder = $basefolder;
  255.         if($_imap->is_parent("INBOX")){     # Find out if server accepts subfolders inside INBOX:
  256.             $_baseFolder = "INBOX".$_sep.$_baseFolder; # I'm not sure if this is needed or wanted - it was in the imap demo code so I kept it.
  257.         }
  258.         $_currentFolder = $_baseFolder;
  259.     }
  260.    
  261.     sub disconnect{
  262.         Utils::logit("Logging out.");
  263.         $_imap->logout or die Utils::logit("INFO: Logout error: ". $_imap->LastError);
  264.         $_imap = undef;
  265.     }  
  266.        
  267.     sub selectFolder{
  268.         my ($self, @folders) = @_;     
  269.         my $find = quotemeta($_sep);
  270.         foreach my $a(@folders){           
  271.             $a =~ s/$find/-/g; #replace the separator if it's used in either of the folder names.
  272.         }  
  273.         unshift(@folders, $_baseFolder);       
  274.         my $fullPath = join($_sep, @folders);      
  275.         #Utils::logit("DRYRUN: Selecting '$fullPath'.");       
  276.         if(!$_imap->exists($fullPath)){ #create folder structure, level by level.      
  277.             my $folder = '';
  278.             foreach my $label (@folders){
  279.                 $folder .= $label;
  280.                 unless($_imap->exists($folder)){
  281.                     Utils::logit("Creating '$folder'");
  282.                     if(!$_imap->create($folder)){
  283.                         Utils::logit("\tINFO: Cannot create '$folder'. Server says: ". $_imap->LastError);                     
  284.                         return undef;
  285.                     }
  286.                 }
  287.                 $folder .= $_sep;
  288.             }
  289.             $fullPath = substr($folder, 0, rindex($folder, $_sep)); #remove trailing separator
  290.         }
  291.         if(!$_imap->select($fullPath)){        
  292.             Utils::logit("\tINFO: Cannot select '$fullPath'. Server says: ". $_imap->LastError);
  293.             return undef;
  294.         }      
  295.         $_currentFolder = $fullPath;
  296.         return 1;
  297.     }
  298.    
  299.     sub submit{            
  300.         my ($self, $mailheaders, $htmlbody, $inboxtime) = @_;  
  301.         #return ($self && $mailheaders && $htmlbody && $inboxtime); #DRYRUN
  302.         my $uidort = $_imap->append_string($_currentFolder, $mailheaders."
  303.        
  304.         ".$htmlbody, ($_markAsRead) ? '\Seen' : undef, $inboxtime)
  305.             or warn Utils::logit("INFO: Could not submit mail. Server says: ". $_imap->LastError);
  306.    
  307.         return defined($uidort);
  308.     }
  309.  
  310. }
  311.    
  312. package Pidgin2Mail{
  313.     use strict; use warnings;
  314.     use Net::SSLeay;   
  315.     use File::Find;
  316.     use DateTime::Format::Strptime;
  317.    
  318.     #CONFIG
  319.     my $_logFolder = 'D:\Dropbox\Applications\PidginPortable\Data\settings\.purple\Pidgin2Mail\t\sample-data\\';
  320.     my $_pathLength = length($_logFolder)-length('\sample-data\\'); #for easy substr and cleaner logging.
  321.     my $_log_file_ext = 'html';
  322.     my $_baseFolder = 'Pidgin';         #what label do you want to sort chats under? ('Chats' is reserved)
  323.     my $_user = '';     # username@gmail.com
  324.     my $_password = ''; #use an application specific password, please. http://goo.gl/aCQAx6    
  325.     #CONFIG
  326.    
  327.     ##MEMBERS
  328.     my %_previousRun;  
  329.     my $_chatLogs = undef;
  330.     my $_dirCount = my $_fileCount = 0;
  331.    
  332.     sub getFileContent {       
  333.         my $fh;    
  334.         if(!open($fh, "<", $File::Find::name)){
  335.             my $nicename = substr($File::Find::name, $_pathLength);
  336.             warn Utils::logit("INFO: Couldn't open $nicename: $!");
  337.             return undef;
  338.         }
  339.         my $contents = undef;      
  340.         {
  341.             local $/ = undef;     # Read entire file at once
  342.             $contents = <$fh>;    # Return file as one single `line'
  343.         }
  344.         close $fh;
  345.         return $contents;
  346.     }
  347.    
  348.     sub filter{
  349.         my $nicename = substr($File::Find::dir, $_pathLength);
  350.         my @clean;     
  351.         my $filecount = my $dircount = 0;
  352.         foreach(@_){                       
  353.             next unless -R $_;  #unless readable                       
  354.             next unless -f _ || -d _; #unless file or dir.
  355.             next if ($_ =~ m/^\./); #ignore files/folders beginning with a period
  356.             if(-f _){ #regular file            
  357.                 next unless (my $size = -s _); #does it have a size?
  358.                 next unless ($_ =~ m/([^.]+)$/)[0] eq $_log_file_ext; #correct file extension?
  359.                 next if exists($_previousRun{$_." ($size)"}); #don't add files we've already processed
  360.                 $filecount++;              
  361.             }elsif(-d _){ #dir
  362.                 $dircount++;
  363.             }
  364.             push(@clean, $_);          
  365.         }
  366.         $_fileCount += $filecount;
  367.         $_dirCount += $dircount;
  368.         Utils::logit("'$nicename' contains $filecount new files and $dircount folders to explore.");
  369.         return @clean;
  370.     }
  371.    
  372.     sub readFile {         
  373.         return unless -f $_; #don't read directories                       
  374.         my $size = -s _;
  375.         return unless my $content = getFileContent($_);
  376.         return unless my $dateTime = filenameToDateTime($_);       
  377.         my $log = new PidginLog(
  378.             name        => substr($_, 0, rindex($_, '.')), #name without file ending
  379.             UID         => $_ ." ($size)", #adding (size in bytes) to filename, to make collisions very unlikely.
  380.             rawContent  => $content,
  381.             dateTime    => $dateTime
  382.         );
  383.         if(!$log){
  384.             Utils::logit("\tINFO: $_ seems broken. Ignoring.");
  385.         }else{
  386.             $_chatLogs->addLog($log);
  387.         }
  388.     }
  389.    
  390.     #takes pidgin logfilename (incl. file extension): '2008-11-07.171101+0100CET.html'
  391.     #an RFC-822 date-time
  392.     sub filenameToDateTime{
  393.         my $filename = shift;
  394.         my $clean = substr($filename, 0, rindex($filename, '.'));   #strip file ending
  395.         #$clean = substr($filename, 0, rindex($filename, '+'));     #strip timezone
  396.         #$clean =~ s/\./T/; #replace dot with T for time separation
  397.         my $strp = DateTime::Format::Strptime->new(
  398.            pattern => '%Y-%m-%d.%H%M%S%z' #ignore timezone name %Z, since this buggers out when Pidgin has been stupid (eg: +2ECT, which should be +2ECST)
  399.         );
  400.         my $datetime = $strp->parse_datetime($clean);
  401.         if(!$datetime){
  402.             warn Utils::logit("INFO: Unable to parse datetime from filename: $filename");  
  403.             return undef;
  404.         }      
  405.         return $datetime;
  406.     }
  407.  
  408.     sub main{          
  409.         dbmopen(%_previousRun,$_logFolder,0666) or die("Couldn't create history-file!");       
  410.         my $start = time();
  411.         $_chatLogs = new LogContainer;
  412.         File::Find::find({wanted => \&readFile, preprocess => \&filter}, $_logFolder);                                             
  413.         $_chatLogs->merge();   
  414.         my $count = $_chatLogs->getCount();
  415.         my $gmail = new IMAPHelper('port' => '993', 'server' => 'imap.gmail.com'); #CONFIG         
  416.         $gmail->connect($_user, $_password, $_baseFolder);     
  417.         my @buckets = $_chatLogs->getBuckets(); #buckets are sorted!
  418.         my $currentFolders = 'mingegurgle';
  419.         foreach my $bucket(@buckets){ # $bucket = "$type $receiver $sender $ymd"                                       
  420.             if(index($bucket, $currentFolders) != 0){ #only call selectFolder when we move into a new folder.
  421.                 my($protocol, $receiver, $rest) = split(" ", $bucket, 3);          
  422.                 $gmail->selectFolder(($protocol, $receiver)) or next;
  423.                 $currentFolders = "$protocol $receiver";               
  424.             }          
  425.             foreach my $pidginlog ($_chatLogs->get($bucket)){
  426.                 Utils::logit("\tSubmitting... $count","\r\n");
  427.                 if($gmail->submit($pidginlog->getMailHeaders(), $pidginlog->getHTMLBody(), $pidginlog->{inboxtime})){                  
  428.                     foreach my $uid (@{$pidginlog->{UIDs}}){ #add all UID to the list.                     
  429.                         $_previousRun{$uid} = 1;
  430.                     }                  
  431.                     $count--;              
  432.                 }else{
  433.                     Utils::logit("INFO: Unable to submit " . $pidginlog->{name} ."!");                 
  434.                 }
  435.             }          
  436.         }
  437.         Utils::logit("INFO: $count logfiles left to submit! Check output and see if anything caused problems.");               
  438.         $gmail->disconnect();              
  439.         dbmclose(%_previousRun);               
  440.         my $run_time = time() - $start;
  441.         Utils::logit("INFO: Processed $_fileCount files in $_dirCount folders in...");             
  442.         Utils::logit("INFO:\t". int($run_time /(24*60*60)) ."days " . ($run_time/(60*60))%24 . "hours " . ($run_time /60)%60 . "mins " . $run_time%60 . "secs");       
  443.         return 0;
  444.     }  
  445.     exit(main(@ARGV));
  446. };
RAW Paste Data