ulfben

Pidgin2Mail

Feb 3rd, 2014
165
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #Pidgin2Mail parses Pidgin HTML chatlogs, and uploads them to an email server.
  2. #Pidgin creates a new log file every time you open a chat window, so many short logs are created if you often close your chat windows.
  3. #This script will merge all files per day, so each contact generates 1 email/day, instead of 1 email per HTML log file.
  4.  
  5. #There's 6 classes in this document, sorry. Search for "CONFIG" and adjust as needed.
  6. #It boils down to: a path to output a  log-file. A path to your chat-logs, your user account info for your mail, and you mail server's adress
  7.  
  8. #Before running it, take a look at your Pidgin log-files. If the content of the <title>-tag doesn't match
  9. #mine (say, you're running Pidgin in a different language), adjust these in the script too.
  10.  
  11. # //Ulf Benjaminsson, ulfben.com, ulfben at gmail dot com
  12.  
  13.  
  14. package Utils{
  15.     use strict; use warnings;
  16.     sub Utils::getStringBetween{
  17.         #($title) = $str =~ m/ '<title>' (.*) '</title>' /;
  18.         my $str = $_[0];
  19.         my $start = $_[1];
  20.         my $end = $_[2];
  21.         my $startpos = length($start) + index($str, $start);
  22.         my $endpos = index($str, $end, $startpos) - length($str); #rindex($str, $end)
  23.         my $result = substr($str, $startpos, $endpos) or warn Utils::logit("\t\t\tINFO: Error in getStringBetween:\n\t\tStart: '$start'\n\t\tEnd: '$end'\n\t '$str'");
  24.         return $result;
  25.     }
  26.  
  27.     my $logdir='D:\Dropbox\Applications\PidginPortable\Data\settings\.purple\Pidgin2Mail\\'; #CONFIG
  28.     sub Utils::logit
  29.     {
  30.         my $s = shift;
  31.         my $eol = shift;
  32.         $eol = $eol ? $eol : "\n";
  33.         my ($logsec,$logmin,$loghour,$logmday,$logmon,$logyear,$logwday,$logyday,$logisdst)=localtime(time);
  34.         my $logtimestamp = sprintf("%4d-%02d-%02d %02d:%02d:%02d",$logyear+1900,$logmon+1,$logmday,$loghour,$logmin,$logsec);      
  35.         my $logfile = sprintf("%s%4d-%02d-%02d.%s",$logdir,$logyear+1900,$logmon+1,$logmday,'txt');            
  36.         my $fh;
  37.         open($fh, '>>', "$logfile") or warn "$logfile: $!";
  38.             print $fh "$logtimestamp\t $s\n";          
  39.         close($fh);
  40.         print "$logtimestamp\t $s $eol";
  41.         return "$logtimestamp\t $s $eol";
  42.     }
  43. }
  44.  
  45. #Holds all instances of PidginLogs, sorts them and, optionally, merges them into daily batches.
  46. package LogContainer{
  47.     use strict; use warnings;
  48.     my %_logs = ();
  49.     my @_hashes = ();
  50.    
  51.     sub new{
  52.         my ($class, %attrs) = @_;      
  53.         bless \%attrs, $class;
  54.     }
  55.    
  56.     sub addLog{
  57.         my ($self, $pidginlog) = @_;       
  58.         my $type = $pidginlog->{chat_protocol};
  59.         my $receiver = $pidginlog->{receiver};
  60.         my $sender = $pidginlog->{sender};     
  61.         my $ymd = $pidginlog->{dateTime}->ymd('-'); # yyyy-mm-dd
  62.         my $name = $pidginlog->{name}; # 2007-08-28.130702+0200CEST assumed unique
  63.         my $bucket = "$type $receiver $sender $ymd"; #assuming whitespace will never appear in either of these.
  64.         if(!$_logs{$bucket}){
  65.             $_logs{$bucket} = ();          
  66.             push(@_hashes, $bucket);           
  67.         }
  68.         if($_logs{$bucket}{$name}){
  69.             die Utils::logit("INFO: $name is not unique, at LogContainer::addLog");
  70.         }
  71.         $_logs{$bucket}{$name} = $pidginlog;       
  72.     }
  73.    
  74.     sub merge{
  75.         my $self = shift;
  76.         my $count = $self->getCount();
  77.         Utils::logit("INFO: Starting merge with $count files in ".@_hashes." buckets.");
  78.         @_hashes = sort { lc($a) cmp lc($b) } @_hashes; #sorting hashes, just to keep the logs readable.
  79.         foreach my $bucket(@_hashes){
  80.             my %logs = %{$_logs{$bucket}};         
  81.             my @keys = $self->getSortedKeys($bucket) or next;                      
  82.             if(@keys < 2){ next; }         
  83.             Utils::logit(@keys." logs in '$bucket' before merge.");            
  84.             my $first = $logs{shift @keys};                    
  85.             foreach my $logname(@keys){            
  86.                 $first->appendLogContent($logs{$logname});             
  87.                 delete $_logs{$bucket}{$logname};                          
  88.             }          
  89.         }  
  90.         $count = $self->getCount();    
  91.         Utils::logit("INFO: Merge completed. There's now $count logs in ".@_hashes." buckets.");
  92.     }
  93.    
  94.     sub getSortedKeys{
  95.         my ($self, $bucket) = @_;                  
  96.         return sort { lc($a) cmp lc($b) } keys %{$_logs{$bucket}};             
  97.     }
  98.    
  99.     sub get{
  100.         my ($self, $bucket) = @_;
  101.         my @result = ();
  102.         my @keys = $self->getSortedKeys($bucket) or Utils::logit("INFO: Invalid bucket '$bucket' in ChatLogs::get");
  103.         foreach my $logname(@keys){            
  104.             Utils::logit("INFO: Invalid logname '$logname' in ChatLogs::get ") unless $_logs{$bucket}{$logname};
  105.             push(@result, $_logs{$bucket}{$logname});          
  106.         }      
  107.         return @result;
  108.     }
  109.        
  110.     sub getBuckets{    
  111.         return @_hashes;
  112.     }
  113.  
  114.     sub getCount{
  115.         my $count = 0;
  116.         foreach my $bucket(@_hashes){
  117.             $count += keys %{$_logs{$bucket}};
  118.         }
  119.         return $count;
  120.     }
  121. }
  122.  
  123. package PidginLog{
  124.     use strict; use warnings;
  125.     use DateTime::Format::Mail;
  126.     use DateTime::Format::RFC3501;
  127.     use feature qw(switch);
  128.     #   MAYBE CONFIG
  129.     #       if your Pidgin logs in another language, make sure these match the <title>-tag of your chat logs.
  130.     my $_sep1 = 'Conversation with ';
  131.     my $_sep2 = ' at ';
  132.     my $_sep3 = ' on ';
  133.     my $_sep4 = ' (';
  134.     my $_sep5 = ')';
  135.     #   CONFIG
  136.    
  137.     my $name;               #'2007-08-28.121917+0200CEST'
  138.     my $UID;                #an identifier of each file, to ensure we don't parse or submit stuff we've already done.
  139.     my @UIDs;               #identifiers of files that were merged with this log.
  140.     my $rawContent;         #full file content
  141.     my $subject;            #'Conversation with x@x.com at 2007-08-28 12:19:17 on y@y.com (msn)'
  142.     my $body;               #all html between <body>-tags
  143.     my $sender;             #'x@x.com'
  144.     my $timestamp;          #timestamp from the log text. Inconsistent, and thus - unused.
  145.     my $receiver;           #'y@y.com'
  146.     my $chat_protocol;      #'msn'
  147.     my $dateTime;           # DateTime-object. '%Y-%m-%d.%H%M%S%z'  2007-08-28T12:19:17
  148.     my $headertime;         #'Tue, 28 Aug 2007 12:19:17 +0200'
  149.     my $inboxtime;          #'28-Aug-2007 12:19:17 +0200'
  150.    
  151.     sub new {
  152.         my ($class) = shift;
  153.         my $self  = { @_ };    
  154.         return undef unless defined $self->{name} and defined $self->{rawContent} and defined $self->{dateTime} and defined $self->{UID};                  
  155.         return undef unless (index($self->{rawContent}, '<title>') >= 0) && (index($self->{rawContent}, '</title>') >= 0); #if the file holds bad content. This happens.
  156.         $self->{headertime}     = DateTime::Format::Mail->format_datetime($self->{dateTime}); #RFC2822
  157.         $self->{inboxtime}      = DateTime::Format::RFC3501->format_datetime($self->{dateTime});
  158.         $self->{subject}        = Utils::getStringBetween($self->{rawContent},'<title>','</title>');       
  159.         $self->{sender}         = Utils::getStringBetween($self->{subject}, $_sep1, $_sep2);
  160.         #$self->{timestamp}     = Utils::getStringBetween($title, $_sep2, $_sep3); #unused, since it's inconsistent
  161.         $self->{receiver}       = Utils::getStringBetween($self->{subject}, $_sep3, $_sep4);
  162.         $self->{chat_protocol}  = Utils::getStringBetween($self->{subject}, $_sep4, $_sep5);
  163.         $self->{body}           = substr($self->{rawContent}, index($self->{rawContent}, '<body>')); #slurp up everything else.
  164.         my $find = quotemeta('</body>'); #cause sometimes Pidgin forgot to add the closing body tag.
  165.         $self->{body} =~ s/$find//g; #let's remove it if it's there.       
  166.         $self->{UIDs}           = [$self->{UID}];
  167.         delete $self->{rawContent};            
  168.         return bless $self, $class;
  169.     }
  170.        
  171.     sub getMailHeaders{
  172.         my $self = shift;
  173.         my $headers = "From: $self->{sender}\r\n";
  174.         $headers .= "To: $self->{receiver}\r\n";   
  175.         $headers .= "Subject: $self->{subject}\r\n";
  176.         $headers .= "Date: $self->{headertime}\r\n";
  177.         $headers .= "MIME-Version: 1.0\r\n";
  178.         $headers .= "Content-Type: text/html; charset=UTF-8\r\n"; #"Content-Type: text/html; charset=iso-8859-1\r\n";
  179.         return $headers;
  180.     }
  181.    
  182.     sub getHTMLBody{
  183.         my $self = shift;
  184.         return '<body>'.$self->{body}.'</body>';
  185.     }
  186.  
  187.     sub getUIDs{
  188.         my $self = shift;
  189.         return $self->{UIDs};
  190.     }
  191.     sub appendLogContent{
  192.         my ($self, $pidginlog) = @_;   
  193.         die Utils::logit("\tINFO: Attempted merge of logs from different senders")  unless $self->{sender} eq $pidginlog->{sender};
  194.         die Utils::logit("\tINFO: Attempted merge of logs to different accounts")   unless $self->{receiver} eq $pidginlog->{receiver};
  195.         die Utils::logit("\tINFO: Attempted merge of logs from different networks") unless $self->{chat_protocol} eq $pidginlog->{chat_protocol};
  196.         die Utils::logit("\tINFO: Attempted unsorted merge of logs")                unless (DateTime->compare($pidginlog->{dateTime}, $self->{dateTime}) > 0);     
  197.         Utils::logit("\tAppending: $pidginlog->{name}");
  198.         $self->{body} .= "\n\n".$pidginlog->{body};        
  199.         push($self->{UIDs}, $pidginlog->{UID});
  200.     }
  201. }
  202.  
  203. package IMAPHelper{
  204.     use strict; use warnings;
  205.     use Net::SSLeay;
  206.     use Mail::IMAPClient;  
  207.     use IO::Socket::SSL;
  208.     use DateTime::Format::Strptime;
  209.  
  210.     ##CONFIG       
  211.     my $_markAsRead = 1; #mark all synced logs as read in inbox.   
  212.     ##CONFIG
  213.  
  214.     ##MEMBERS
  215.     my $_imap = undef;
  216.     my $_sep = '';          # folder hierarchy separator character 
  217.     my $_baseFolder = '';  
  218.     my $_currentFolder = '';
  219.     my $_port = '';
  220.     my $_server = '';  
  221.    
  222.     sub new{
  223.         my ($class, %attrs) = @_;      
  224.         $_port = $attrs{'port'};
  225.         $_server = $attrs{'server'};
  226.         bless \%attrs, $class;
  227.     }
  228.     sub DESTROY {
  229.         my $self = shift;
  230.         if($_imap){
  231.             $self->disconnect();
  232.         }
  233.     }
  234.     sub connect{
  235.         my ($self, $user, $password, $basefolder) = @_;
  236.         if($_imap){
  237.             die "\n\tError: connecting twice on same socket";
  238.         }
  239.         Utils::logit("Logging in '$user'.");
  240.         my $socket = IO::Socket::SSL->new(  
  241.            PeerAddr =>  $_server,  
  242.            PeerPort =>  $_port,
  243.            SSL_verify_mode => 'SSL_VERIFY_NONE'
  244.         )  
  245.         or die "socket(): $@";  
  246.         $_imap = Mail::IMAPClient->new(
  247.             User     => $user,
  248.             Password => $password,
  249.             socket => $socket,
  250.             Uid => 1,
  251.         ) or die ($_imap);
  252.         $_imap->IsAuthenticated() or die Utils::logit("\tINFO: Couldn't Authenticate!");
  253.         $_sep = $_imap->separator;          # Get folder hierarchy separator character
  254.         $_baseFolder = $basefolder;
  255.         if($_imap->is_parent("INBOX")){     # Find out if server accepts subfolders inside INBOX:
  256.             $_baseFolder = "INBOX".$_sep.$_baseFolder; # I'm not sure if this is needed or wanted - it was in the imap demo code so I kept it.
  257.         }
  258.         $_currentFolder = $_baseFolder;
  259.     }
  260.    
  261.     sub disconnect{
  262.         Utils::logit("Logging out.");
  263.         $_imap->logout or die Utils::logit("INFO: Logout error: ". $_imap->LastError);
  264.         $_imap = undef;
  265.     }  
  266.        
  267.     sub selectFolder{
  268.         my ($self, @folders) = @_;     
  269.         my $find = quotemeta($_sep);
  270.         foreach my $a(@folders){           
  271.             $a =~ s/$find/-/g; #replace the separator if it's used in either of the folder names.
  272.         }  
  273.         unshift(@folders, $_baseFolder);       
  274.         my $fullPath = join($_sep, @folders);      
  275.         #Utils::logit("DRYRUN: Selecting '$fullPath'.");       
  276.         if(!$_imap->exists($fullPath)){ #create folder structure, level by level.      
  277.             my $folder = '';
  278.             foreach my $label (@folders){
  279.                 $folder .= $label;
  280.                 unless($_imap->exists($folder)){
  281.                     Utils::logit("Creating '$folder'");
  282.                     if(!$_imap->create($folder)){
  283.                         Utils::logit("\tINFO: Cannot create '$folder'. Server says: ". $_imap->LastError);                     
  284.                         return undef;
  285.                     }
  286.                 }
  287.                 $folder .= $_sep;
  288.             }
  289.             $fullPath = substr($folder, 0, rindex($folder, $_sep)); #remove trailing separator
  290.         }
  291.         if(!$_imap->select($fullPath)){        
  292.             Utils::logit("\tINFO: Cannot select '$fullPath'. Server says: ". $_imap->LastError);
  293.             return undef;
  294.         }      
  295.         $_currentFolder = $fullPath;
  296.         return 1;
  297.     }
  298.    
  299.     sub submit{            
  300.         my ($self, $mailheaders, $htmlbody, $inboxtime) = @_;  
  301.         #return ($self && $mailheaders && $htmlbody && $inboxtime); #DRYRUN
  302.         my $uidort = $_imap->append_string($_currentFolder, $mailheaders."
  303.        
  304.         ".$htmlbody, ($_markAsRead) ? '\Seen' : undef, $inboxtime)
  305.             or warn Utils::logit("INFO: Could not submit mail. Server says: ". $_imap->LastError);
  306.    
  307.         return defined($uidort);
  308.     }
  309.  
  310. }
  311.    
  312. package Pidgin2Mail{
  313.     use strict; use warnings;
  314.     use Net::SSLeay;   
  315.     use File::Find;
  316.     use DateTime::Format::Strptime;
  317.    
  318.     #CONFIG
  319.     my $_logFolder = 'D:\Dropbox\Applications\PidginPortable\Data\settings\.purple\Pidgin2Mail\t\sample-data\\';
  320.     my $_pathLength = length($_logFolder)-length('\sample-data\\'); #for easy substr and cleaner logging.
  321.     my $_log_file_ext = 'html';
  322.     my $_baseFolder = 'Pidgin';         #what label do you want to sort chats under? ('Chats' is reserved)
  323.     my $_user = '';     # username@gmail.com
  324.     my $_password = ''; #use an application specific password, please. http://goo.gl/aCQAx6    
  325.     #CONFIG
  326.    
  327.     ##MEMBERS
  328.     my %_previousRun;  
  329.     my $_chatLogs = undef;
  330.     my $_dirCount = my $_fileCount = 0;
  331.    
  332.     sub getFileContent {       
  333.         my $fh;    
  334.         if(!open($fh, "<", $File::Find::name)){
  335.             my $nicename = substr($File::Find::name, $_pathLength);
  336.             warn Utils::logit("INFO: Couldn't open $nicename: $!");
  337.             return undef;
  338.         }
  339.         my $contents = undef;      
  340.         {
  341.             local $/ = undef;     # Read entire file at once
  342.             $contents = <$fh>;    # Return file as one single `line'
  343.         }
  344.         close $fh;
  345.         return $contents;
  346.     }
  347.    
  348.     sub filter{
  349.         my $nicename = substr($File::Find::dir, $_pathLength);
  350.         my @clean;     
  351.         my $filecount = my $dircount = 0;
  352.         foreach(@_){                       
  353.             next unless -R $_;  #unless readable                       
  354.             next unless -f _ || -d _; #unless file or dir.
  355.             next if ($_ =~ m/^\./); #ignore files/folders beginning with a period
  356.             if(-f _){ #regular file            
  357.                 next unless (my $size = -s _); #does it have a size?
  358.                 next unless ($_ =~ m/([^.]+)$/)[0] eq $_log_file_ext; #correct file extension?
  359.                 next if exists($_previousRun{$_." ($size)"}); #don't add files we've already processed
  360.                 $filecount++;              
  361.             }elsif(-d _){ #dir
  362.                 $dircount++;
  363.             }
  364.             push(@clean, $_);          
  365.         }
  366.         $_fileCount += $filecount;
  367.         $_dirCount += $dircount;
  368.         Utils::logit("'$nicename' contains $filecount new files and $dircount folders to explore.");
  369.         return @clean;
  370.     }
  371.    
  372.     sub readFile {         
  373.         return unless -f $_; #don't read directories                       
  374.         my $size = -s _;
  375.         return unless my $content = getFileContent($_);
  376.         return unless my $dateTime = filenameToDateTime($_);       
  377.         my $log = new PidginLog(
  378.             name        => substr($_, 0, rindex($_, '.')), #name without file ending
  379.             UID         => $_ ." ($size)", #adding (size in bytes) to filename, to make collisions very unlikely.
  380.             rawContent  => $content,
  381.             dateTime    => $dateTime
  382.         );
  383.         if(!$log){
  384.             Utils::logit("\tINFO: $_ seems broken. Ignoring.");
  385.         }else{
  386.             $_chatLogs->addLog($log);
  387.         }
  388.     }
  389.    
  390.     #takes pidgin logfilename (incl. file extension): '2008-11-07.171101+0100CET.html'
  391.     #an RFC-822 date-time
  392.     sub filenameToDateTime{
  393.         my $filename = shift;
  394.         my $clean = substr($filename, 0, rindex($filename, '.'));   #strip file ending
  395.         #$clean = substr($filename, 0, rindex($filename, '+'));     #strip timezone
  396.         #$clean =~ s/\./T/; #replace dot with T for time separation
  397.         my $strp = DateTime::Format::Strptime->new(
  398.            pattern => '%Y-%m-%d.%H%M%S%z' #ignore timezone name %Z, since this buggers out when Pidgin has been stupid (eg: +2ECT, which should be +2ECST)
  399.         );
  400.         my $datetime = $strp->parse_datetime($clean);
  401.         if(!$datetime){
  402.             warn Utils::logit("INFO: Unable to parse datetime from filename: $filename");  
  403.             return undef;
  404.         }      
  405.         return $datetime;
  406.     }
  407.  
  408.     sub main{          
  409.         dbmopen(%_previousRun,$_logFolder,0666) or die("Couldn't create history-file!");       
  410.         my $start = time();
  411.         $_chatLogs = new LogContainer;
  412.         File::Find::find({wanted => \&readFile, preprocess => \&filter}, $_logFolder);                                             
  413.         $_chatLogs->merge();   
  414.         my $count = $_chatLogs->getCount();
  415.         my $gmail = new IMAPHelper('port' => '993', 'server' => 'imap.gmail.com'); #CONFIG         
  416.         $gmail->connect($_user, $_password, $_baseFolder);     
  417.         my @buckets = $_chatLogs->getBuckets(); #buckets are sorted!
  418.         my $currentFolders = 'mingegurgle';
  419.         foreach my $bucket(@buckets){ # $bucket = "$type $receiver $sender $ymd"                                       
  420.             if(index($bucket, $currentFolders) != 0){ #only call selectFolder when we move into a new folder.
  421.                 my($protocol, $receiver, $rest) = split(" ", $bucket, 3);          
  422.                 $gmail->selectFolder(($protocol, $receiver)) or next;
  423.                 $currentFolders = "$protocol $receiver";               
  424.             }          
  425.             foreach my $pidginlog ($_chatLogs->get($bucket)){
  426.                 Utils::logit("\tSubmitting... $count","\r\n");
  427.                 if($gmail->submit($pidginlog->getMailHeaders(), $pidginlog->getHTMLBody(), $pidginlog->{inboxtime})){                  
  428.                     foreach my $uid (@{$pidginlog->{UIDs}}){ #add all UID to the list.                     
  429.                         $_previousRun{$uid} = 1;
  430.                     }                  
  431.                     $count--;              
  432.                 }else{
  433.                     Utils::logit("INFO: Unable to submit " . $pidginlog->{name} ."!");                 
  434.                 }
  435.             }          
  436.         }
  437.         Utils::logit("INFO: $count logfiles left to submit! Check output and see if anything caused problems.");               
  438.         $gmail->disconnect();              
  439.         dbmclose(%_previousRun);               
  440.         my $run_time = time() - $start;
  441.         Utils::logit("INFO: Processed $_fileCount files in $_dirCount folders in...");             
  442.         Utils::logit("INFO:\t". int($run_time /(24*60*60)) ."days " . ($run_time/(60*60))%24 . "hours " . ($run_time /60)%60 . "mins " . $run_time%60 . "secs");       
  443.         return 0;
  444.     }  
  445.     exit(main(@ARGV));
  446. };
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound, or popup ads, we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×