Advertisement
techcarlin

Python - Gmail Save

Aug 11th, 2024
1,802
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.41 KB | None | 0 0
  1. import imaplib
  2. import email
  3. from email.header import decode_header
  4. import os
  5. import ssl
  6. import sys
  7. import getpass
  8. from datetime import datetime
  9. import json
  10. import logging
  11. import chardet
  12.  
  13. # Set up logging
  14. logging.basicConfig(filename='email_download.log', level=logging.INFO,
  15.                     format='%(asctime)s - %(levelname)s - %(message)s')
  16.  
  17. def clean_filename(filename):
  18.     return "".join(c for c in filename if c.isalnum() or c in (' ', '.', '_')).rstrip()
  19.  
  20. def decode_string(string, encoding):
  21.     if isinstance(string, bytes):
  22.         if not encoding:
  23.             encoding = chardet.detect(string)['encoding']
  24.         return string.decode(encoding or 'utf-8', errors='replace')
  25.     return string
  26.  
  27. def save_email_content(msg, folder_path):
  28.     try:
  29.         body = ""
  30.         if msg.is_multipart():
  31.             for part in msg.walk():
  32.                 ctype = part.get_content_type()
  33.                 cdispo = str(part.get('Content-Disposition'))
  34.                 if ctype == 'text/plain' and 'attachment' not in cdispo:
  35.                     body = decode_string(part.get_payload(decode=True), part.get_content_charset())
  36.                     break
  37.         else:
  38.             body = decode_string(msg.get_payload(decode=True), msg.get_content_charset())
  39.        
  40.         with open(os.path.join(folder_path, 'email_content.txt'), 'w', encoding='utf-8') as f:
  41.             f.write(f"Subject: {msg['Subject']}\n")
  42.             f.write(f"From: {msg['From']}\n")
  43.             f.write(f"To: {msg['To']}\n")
  44.             f.write(f"Date: {msg['Date']}\n\n")
  45.             f.write(body)
  46.         return True
  47.     except Exception as e:
  48.         logging.error(f"Error saving email content: {e}")
  49.         return False
  50.  
  51. def save_attachment(part, folder_path):
  52.     filename = part.get_filename()
  53.     if filename:
  54.         filename = decode_string(decode_header(filename)[0][0], decode_header(filename)[0][1])
  55.         filepath = os.path.join(folder_path, clean_filename(filename))
  56.         with open(filepath, "wb") as f:
  57.             f.write(part.get_payload(decode=True))
  58.         logging.info(f"Attachment saved: {filename}")
  59.         return True
  60.     return False
  61.  
  62. def connect_to_gmail(email, password):
  63.     print("Connecting to Gmail...")
  64.     context = ssl.create_default_context()
  65.     try:
  66.         mail = imaplib.IMAP4_SSL("imap.gmail.com", ssl_context=context)
  67.         mail.login(email, password)
  68.         print("Successfully connected to Gmail.")
  69.         return mail
  70.     except imaplib.IMAP4.error as e:
  71.         print(f"Error connecting to Gmail: {e}")
  72.         logging.error(f"Error connecting to Gmail: {e}")
  73.         raise
  74.  
  75. def process_emails(mail, email_address, password, base_dir, start_index, batch_size):
  76.     print(f"Starting to process emails from index {start_index}")
  77.     mail.select("inbox")
  78.     status, messages = mail.search(None, "ALL")
  79.     email_ids = messages[0].split()
  80.    
  81.     total_emails = len(email_ids)
  82.     end_index = min(start_index + batch_size, total_emails)
  83.    
  84.     processed_emails = 0
  85.     skipped_emails = 0
  86.  
  87.     for index in range(start_index, end_index):
  88.         if processed_emails >= batch_size:
  89.             print(f"Batch size of {batch_size} reached. Stopping processing.")
  90.             logging.info(f"Batch size of {batch_size} reached. Stopping processing.")
  91.             break
  92.  
  93.         email_id = email_ids[index]
  94.         try:
  95.             status, msg_data = mail.fetch(email_id, "(RFC822)")
  96.             if status != 'OK':
  97.                 print(f"Failed to fetch email {email_id}: {status}")
  98.                 logging.warning(f"Failed to fetch email {email_id}: {status}")
  99.                 skipped_emails += 1
  100.                 continue
  101.  
  102.             for response_part in msg_data:
  103.                 if isinstance(response_part, tuple):
  104.                     email_body = response_part[1]
  105.                     try:
  106.                         msg = email.message_from_bytes(email_body)
  107.                     except Exception as e:
  108.                         print(f"Error parsing email {email_id}: {e}")
  109.                         logging.error(f"Error parsing email {email_id}: {e}")
  110.                         skipped_emails += 1
  111.                         continue
  112.  
  113.                     subject, encoding = decode_header(msg["Subject"])[0]
  114.                     subject = decode_string(subject, encoding)
  115.                     date_str = msg.get("Date")
  116.                     try:
  117.                         date = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z")
  118.                     except ValueError:
  119.                         date = datetime.now()
  120.                    
  121.                     folder_name = clean_filename(f"{date.strftime('%Y-%m-%d_%H-%M-%S')}_{subject[:30]}")
  122.                     folder_path = os.path.join(base_dir, folder_name)
  123.                     os.makedirs(folder_path, exist_ok=True)
  124.  
  125.                     print(f"Processing: {subject}")
  126.                     if save_email_content(msg, folder_path):
  127.                         attachments_saved = 0
  128.                         for part in msg.walk():
  129.                             if part.get_content_maintype() == "multipart":
  130.                                 continue
  131.                             if part.get("Content-Disposition") is None:
  132.                                 continue
  133.                             if save_attachment(part, folder_path):
  134.                                 attachments_saved += 1
  135.                         processed_emails += 1
  136.                         print(f"Email processed. Attachments saved: {attachments_saved}")
  137.                     else:
  138.                         skipped_emails += 1
  139.                         print(f"Failed to save email content for: {subject}")
  140.  
  141.         except Exception as e:
  142.             print(f"Error processing email {email_id}: {e}")
  143.             logging.error(f"Error processing email {email_id}: {e}")
  144.             skipped_emails += 1
  145.  
  146.         if processed_emails % 10 == 0:
  147.             print(f"Current batch progress: {processed_emails}/{batch_size} emails processed")
  148.             print(f"Overall Progress: {start_index + processed_emails}/{total_emails} emails processed")
  149.             logging.info(f"Current batch progress: {processed_emails}/{batch_size} emails processed")
  150.             logging.info(f"Overall Progress: {start_index + processed_emails}/{total_emails} emails processed")
  151.  
  152.     last_processed = start_index + processed_emails
  153.     print(f"Batch complete. Processed {processed_emails} emails in this batch.")
  154.     print(f"Total progress: {last_processed}/{total_emails} emails processed")
  155.     logging.info(f"Batch complete. Processed {processed_emails} emails in this batch.")
  156.     logging.info(f"Total progress: {last_processed}/{total_emails} emails processed")
  157.  
  158.     return total_emails, processed_emails, skipped_emails, last_processed
  159.  
  160. def save_progress(progress_file, last_processed, total_emails):
  161.     with open(progress_file, 'w') as f:
  162.         json.dump({
  163.             'last_processed': last_processed,
  164.             'total_emails': total_emails
  165.         }, f)
  166.     print(f"Progress saved to: {progress_file}")
  167.     logging.info(f"Progress saved to: {progress_file}")
  168.  
  169. def load_progress(progress_file):
  170.     if os.path.exists(progress_file):
  171.         with open(progress_file, 'r') as f:
  172.             data = json.load(f)
  173.             return data.get('last_processed', 0), data.get('total_emails', 0)
  174.     return 0, 0
  175.  
  176. def main():
  177.     print("Starting Gmail Email Downloader")
  178.     EMAIL = "[email protected]"
  179.     PASSWORD = getpass.getpass("Enter your Gmail password or app password: ")
  180.  
  181.     if not EMAIL or not PASSWORD:
  182.         print("Error: Email or password not provided.")
  183.         logging.error("Error: Email or password not provided.")
  184.         sys.exit(1)
  185.  
  186.     base_dir = "c:\\emails"
  187.     os.makedirs(base_dir, exist_ok=True)
  188.     print(f"Saving emails to: {base_dir}")
  189.  
  190.     progress_file = os.path.join(base_dir, 'progress.json')
  191.     print(f"Progress file location: {progress_file}")
  192.     logging.info(f"Progress file location: {progress_file}")
  193.  
  194.     start_index, total_emails_saved = load_progress(progress_file)
  195.     print(f"Resuming from email index: {start_index}")
  196.     logging.info(f"Resuming from email index: {start_index}")
  197.  
  198.     batch_size = 5000  # Process 5000 emails at a time
  199.  
  200.     try:
  201.         mail = connect_to_gmail(EMAIL, PASSWORD)
  202.         total_emails, processed_emails, skipped_emails, last_processed = process_emails(mail, EMAIL, PASSWORD, base_dir, start_index, batch_size)
  203.         mail.logout()
  204.         print("Logged out from Gmail.")
  205.  
  206.         save_progress(progress_file, last_processed, total_emails)
  207.  
  208.         print("\nBatch processing completed.")
  209.         print(f"Total emails in inbox: {total_emails}")
  210.         print(f"Emails processed so far: {last_processed}")
  211.         print(f"Emails processed in this batch: {processed_emails}")
  212.         print(f"Emails skipped in this batch: {skipped_emails}")
  213.         print(f"Emails remaining: {total_emails - last_processed}")
  214.  
  215.         if last_processed < total_emails:
  216.             print(f"\nThere are more emails to process. Run the script again to continue.")
  217.             print(f"Overall Progress: {last_processed / total_emails * 100:.2f}% complete")
  218.         else:
  219.             print("\nAll emails have been processed!")
  220.  
  221.     except Exception as e:
  222.         print(f"An unexpected error occurred: {e}")
  223.         logging.error(f"An unexpected error occurred: {e}")
  224.  
  225.     print("Script execution completed.")
  226.  
  227. if __name__ == "__main__":
  228.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement