Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import email
import getpass
import imaplib
import json
import logging
import os
import ssl
import sys
from datetime import datetime
from email.header import decode_header
from email.utils import parsedate_to_datetime

import chardet
# Route log records at INFO level and above to 'email_download.log'
# (relative path, so it is created in the current working directory),
# one "timestamp - level - message" line per record.
logging.basicConfig(
    filename='email_download.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
def clean_filename(filename):
    """Return *filename* reduced to characters that are safe in one path component.

    Only alphanumeric characters, spaces, dots and underscores are kept, so
    path separators and all other punctuation are dropped.  Trailing spaces
    and dots are then removed.

    Args:
        filename: The untrusted name (e.g. an attachment name or a subject).

    Returns:
        The sanitised name, or ``"unnamed"`` when nothing usable is left.
    """
    allowed_punctuation = (' ', '.', '_')
    kept = "".join(c for c in filename if c.isalnum() or c in allowed_punctuation)
    # Windows does not allow a name to end in a dot or a space, and a name made
    # only of dots ("." / "..") would refer to a directory instead of a new file.
    kept = kept.rstrip(' .')
    # An empty result would make os.path.join() return the folder itself.
    return kept or "unnamed"
def decode_string(string, encoding):
    """Decode *string* to ``str`` when it is ``bytes``; return anything else as is.

    Args:
        string: The value to decode.  Non-``bytes`` values (already-decoded
            text, ``None``) are returned unchanged.
        encoding: Charset name to decode with.  When falsy, the charset is
            guessed with ``chardet``; if that yields nothing, UTF-8 is used.

    Returns:
        The decoded text, with undecodable bytes replaced rather than raising.
    """
    if not isinstance(string, bytes):
        return string
    if not encoding:
        encoding = chardet.detect(string)['encoding']
    try:
        return string.decode(encoding or 'utf-8', errors='replace')
    except LookupError:
        # Mail headers can declare a charset Python has no codec for
        # (for example "unknown-8bit"); fall back to UTF-8 instead of failing.
        return string.decode('utf-8', errors='replace')
def save_email_content(msg, folder_path):
    """Write the message's main headers and plain-text body to a text file.

    The output is ``email_content.txt`` inside *folder_path*, encoded as
    UTF-8.  For a multipart message the body is taken from the first
    ``text/plain`` part whose Content-Disposition does not mention
    "attachment"; if there is no such part the body is empty.

    Args:
        msg: The parsed email message.
        folder_path: Existing directory to write into.

    Returns:
        True when the file was written, False when any exception occurred
        (the exception is logged, not raised).
    """
    try:
        if msg.is_multipart():
            # First inline text/plain part, or None if the message has none.
            body_source = next(
                (
                    candidate
                    for candidate in msg.walk()
                    if candidate.get_content_type() == 'text/plain'
                    and 'attachment' not in str(candidate.get('Content-Disposition'))
                ),
                None,
            )
        else:
            body_source = msg

        if body_source is None:
            body = ""
        else:
            raw_body = body_source.get_payload(decode=True)
            body = decode_string(raw_body, body_source.get_content_charset())

        target = os.path.join(folder_path, 'email_content.txt')
        with open(target, 'w', encoding='utf-8') as out:
            out.write(f"Subject: {msg['Subject']}\n")
            out.write(f"From: {msg['From']}\n")
            out.write(f"To: {msg['To']}\n")
            out.write(f"Date: {msg['Date']}\n\n")
            out.write(body)
    except Exception as e:
        logging.error(f"Error saving email content: {e}")
        return False
    return True
def save_attachment(part, folder_path):
    """Write one attachment part to a file inside *folder_path*.

    Args:
        part: A message part; it is treated as an attachment only if it
            carries a filename.
        folder_path: Existing directory to write into.

    Returns:
        True when a file was written; False when the part has no filename or
        no decodable payload.

    Raises:
        OSError: If the file cannot be created or written.
    """
    raw_name = part.get_filename()
    if not raw_name:
        return False

    # A header value can be split into several encoded words; decode and join
    # all of them instead of keeping only the first fragment.
    filename = "".join(
        decode_string(fragment, charset)
        for fragment, charset in decode_header(raw_name)
    )

    payload = part.get_payload(decode=True)
    if payload is None:
        # get_payload(decode=True) returns None for multipart containers, which
        # includes attached messages (message/rfc822).  Skip before opening the
        # file so no empty file is left behind and the caller does not crash.
        logging.warning(f"Attachment skipped (no decodable payload): {filename}")
        return False

    filepath = os.path.join(folder_path, clean_filename(filename))
    with open(filepath, "wb") as f:
        f.write(payload)
    logging.info(f"Attachment saved: {filename}")
    return True
def connect_to_gmail(email, password):
    """Open an IMAP-over-SSL session to ``imap.gmail.com`` and log in.

    Args:
        email: Account address used as the login name.  (The parameter name
            shadows the ``email`` module inside this function only.)
        password: Account password or app password.

    Returns:
        The logged-in ``imaplib.IMAP4_SSL`` connection.

    Raises:
        imaplib.IMAP4.error: If the server rejects the connection or login;
            the error is printed and logged before being re-raised.
    """
    print("Connecting to Gmail...")
    context = ssl.create_default_context()
    mail = None
    try:
        mail = imaplib.IMAP4_SSL("imap.gmail.com", ssl_context=context)
        mail.login(email, password)
    except imaplib.IMAP4.error as e:
        print(f"Error connecting to Gmail: {e}")
        logging.error(f"Error connecting to Gmail: {e}")
        if mail is not None:
            # The socket is already open when the login is rejected; close it
            # here because the caller never receives the connection object.
            try:
                mail.shutdown()
            except OSError:
                pass  # best-effort cleanup; the login error is what matters
        raise
    print("Successfully connected to Gmail.")
    return mail
def _subject_text(msg):
    """Return the message's Subject header decoded to text."""
    raw_subject = msg["Subject"]
    if raw_subject is None:
        # decode_header() cannot handle a missing header.
        return "(no subject)"
    # Join every decoded fragment; a subject may mix several encoded words.
    return "".join(
        decode_string(fragment, charset)
        for fragment, charset in decode_header(raw_subject)
    )


def _message_date(msg):
    """Return the message's Date header as a datetime, or now() if unusable."""
    try:
        return parsedate_to_datetime(msg.get("Date"))
    except (TypeError, ValueError):
        # Missing or malformed Date header.
        return datetime.now()


def _save_message(msg, base_dir):
    """Save one parsed message and its attachments; return True on success."""
    subject = _subject_text(msg)
    date = _message_date(msg)
    folder_name = clean_filename(f"{date.strftime('%Y-%m-%d_%H-%M-%S')}_{subject[:30]}")
    folder_path = os.path.join(base_dir, folder_name)
    os.makedirs(folder_path, exist_ok=True)
    print(f"Processing: {subject}")

    if not save_email_content(msg, folder_path):
        print(f"Failed to save email content for: {subject}")
        return False

    attachments_saved = 0
    for part in msg.walk():
        if part.get_content_maintype() == "multipart":
            continue
        if part.get("Content-Disposition") is None:
            continue
        if save_attachment(part, folder_path):
            attachments_saved += 1
    print(f"Email processed. Attachments saved: {attachments_saved}")
    return True


def _save_fetched_messages(email_id, msg_data, base_dir):
    """Save every message in one FETCH response; return (saved, skipped)."""
    saved = 0
    skipped = 0
    for response_part in msg_data:
        # Only tuple items carry message bytes; other items are ignored.
        if not isinstance(response_part, tuple):
            continue
        try:
            msg = email.message_from_bytes(response_part[1])
        except Exception as e:
            print(f"Error parsing email {email_id}: {e}")
            logging.error(f"Error parsing email {email_id}: {e}")
            skipped += 1
            continue
        try:
            if _save_message(msg, base_dir):
                saved += 1
            else:
                skipped += 1
        except Exception as e:
            # One bad message must not stop the batch.
            print(f"Error processing email {email_id}: {e}")
            logging.error(f"Error processing email {email_id}: {e}")
            skipped += 1
    return saved, skipped


def process_emails(mail, email_address, password, base_dir, start_index, batch_size):
    """Download one batch of inbox messages into per-message folders.

    Messages are addressed by their position in the result of an ``ALL``
    search on the inbox.  Positions ``start_index`` up to (but excluding)
    ``start_index + batch_size`` are fetched, capped at the mailbox size.

    Args:
        mail: A logged-in IMAP connection (``select``, ``search`` and
            ``fetch`` are used).
        email_address: Unused here; kept for interface compatibility.
        password: Unused here; kept for interface compatibility.
        base_dir: Directory under which one folder per message is created.
        start_index: Zero-based position of the first message to fetch.
        batch_size: Maximum number of messages to handle in this call.

    Returns:
        A tuple ``(total_emails, processed_emails, skipped_emails,
        last_processed)`` where ``last_processed`` is the position the next
        batch should start from.  It counts skipped messages too, so the next
        batch does not overlap this one.

    Raises:
        imaplib.IMAP4.error: If the inbox search fails.
    """
    print(f"Starting to process emails from index {start_index}")
    mail.select("inbox")
    status, messages = mail.search(None, "ALL")
    if status != 'OK':
        # Without this check the error text would be parsed as message ids.
        raise imaplib.IMAP4.error(f"Inbox search failed: {status}")
    email_ids = (messages[0] or b"").split()
    total_emails = len(email_ids)
    end_index = min(start_index + batch_size, total_emails)
    processed_emails = 0
    skipped_emails = 0
    # Position to resume from; advanced after every message that was examined.
    next_index = start_index

    for index in range(start_index, end_index):
        if processed_emails >= batch_size:
            print(f"Batch size of {batch_size} reached. Stopping processing.")
            logging.info(f"Batch size of {batch_size} reached. Stopping processing.")
            break
        email_id = email_ids[index]
        try:
            status, msg_data = mail.fetch(email_id, "(RFC822)")
        except (imaplib.IMAP4.abort, OSError) as e:
            # The connection is gone, so every further fetch would fail too.
            # Stop here without advancing, so the next run resumes at this message.
            print(f"Error processing email {email_id}: {e}")
            logging.error(f"Error processing email {email_id}: {e}")
            break
        except Exception as e:
            print(f"Error processing email {email_id}: {e}")
            logging.error(f"Error processing email {email_id}: {e}")
            skipped_emails += 1
        else:
            if status != 'OK':
                print(f"Failed to fetch email {email_id}: {status}")
                logging.warning(f"Failed to fetch email {email_id}: {status}")
                skipped_emails += 1
            else:
                saved, skipped = _save_fetched_messages(email_id, msg_data, base_dir)
                processed_emails += saved
                skipped_emails += skipped
        next_index = index + 1

        # Report once per ten examined messages, whether saved or skipped.
        if (next_index - start_index) % 10 == 0:
            print(f"Current batch progress: {processed_emails}/{batch_size} emails processed")
            print(f"Overall Progress: {next_index}/{total_emails} emails processed")
            logging.info(f"Current batch progress: {processed_emails}/{batch_size} emails processed")
            logging.info(f"Overall Progress: {next_index}/{total_emails} emails processed")

    last_processed = next_index
    print(f"Batch complete. Processed {processed_emails} emails in this batch.")
    print(f"Total progress: {last_processed}/{total_emails} emails processed")
    logging.info(f"Batch complete. Processed {processed_emails} emails in this batch.")
    logging.info(f"Total progress: {last_processed}/{total_emails} emails processed")
    return total_emails, processed_emails, skipped_emails, last_processed
def save_progress(progress_file, last_processed, total_emails):
    """Persist the resume position as JSON in *progress_file*.

    Args:
        progress_file: Path of the JSON file to (over)write.
        last_processed: Position the next run should resume from.
        total_emails: Mailbox size observed in this run.

    Raises:
        OSError: If the file cannot be written or moved into place.
    """
    snapshot = {
        'last_processed': last_processed,
        'total_emails': total_emails,
    }
    # Write to a sibling temp file and swap it into place, so an interrupted
    # run can never leave a truncated progress file behind.
    temp_file = f"{progress_file}.tmp"
    with open(temp_file, 'w', encoding='utf-8') as f:
        json.dump(snapshot, f)
    os.replace(temp_file, progress_file)
    print(f"Progress saved to: {progress_file}")
    logging.info(f"Progress saved to: {progress_file}")
def load_progress(progress_file):
    """Read the resume position written by a previous run.

    Args:
        progress_file: Path of the JSON progress file.

    Returns:
        A tuple ``(last_processed, total_emails)``.  ``(0, 0)`` is returned
        when the file does not exist or cannot be used; a key missing from a
        valid file defaults to 0.
    """
    if not os.path.exists(progress_file):
        return 0, 0
    try:
        with open(progress_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError is a ValueError.  A truncated or empty file
        # should restart the download, not crash the script.
        print(f"Warning: could not read progress file {progress_file}: {e}")
        logging.warning(f"Could not read progress file {progress_file}: {e}")
        return 0, 0
    if not isinstance(data, dict):
        print(f"Warning: unexpected content in progress file {progress_file}")
        logging.warning(f"Unexpected content in progress file {progress_file}")
        return 0, 0
    return data.get('last_processed', 0), data.get('total_emails', 0)
def main():
    """Run one download batch: log in, fetch messages, save the resume position.

    The password is prompted for interactively.  Messages and the progress
    file are stored under ``c:\\emails``.  Any error during the batch is
    printed and logged rather than raised; the process exits with status 1
    only when the email address or password is empty.
    """
    print("Starting Gmail Email Downloader")
    # NOTE(review): this value looks like a placeholder left by the paste
    # site's address obfuscation; set it to the real account before running.
    EMAIL = "[email protected]"
    PASSWORD = getpass.getpass("Enter your Gmail password or app password: ")
    if not EMAIL or not PASSWORD:
        print("Error: Email or password not provided.")
        logging.error("Error: Email or password not provided.")
        sys.exit(1)

    base_dir = "c:\\emails"
    os.makedirs(base_dir, exist_ok=True)
    print(f"Saving emails to: {base_dir}")
    progress_file = os.path.join(base_dir, 'progress.json')
    print(f"Progress file location: {progress_file}")
    logging.info(f"Progress file location: {progress_file}")

    # Only the resume position is needed; the stored mailbox size is ignored.
    start_index, _ = load_progress(progress_file)
    print(f"Resuming from email index: {start_index}")
    logging.info(f"Resuming from email index: {start_index}")
    batch_size = 5000  # Process 5000 emails at a time

    try:
        mail = connect_to_gmail(EMAIL, PASSWORD)
        try:
            total_emails, processed_emails, skipped_emails, last_processed = process_emails(
                mail, EMAIL, PASSWORD, base_dir, start_index, batch_size
            )
        finally:
            # Always end the session, even when the batch raised.  A failing
            # logout must not stop the progress of a finished batch from
            # being saved below.
            try:
                mail.logout()
                print("Logged out from Gmail.")
            except (imaplib.IMAP4.error, OSError) as e:
                logging.warning(f"Error logging out from Gmail: {e}")

        save_progress(progress_file, last_processed, total_emails)
        print("\nBatch processing completed.")
        print(f"Total emails in inbox: {total_emails}")
        print(f"Emails processed so far: {last_processed}")
        print(f"Emails processed in this batch: {processed_emails}")
        print(f"Emails skipped in this batch: {skipped_emails}")
        print(f"Emails remaining: {total_emails - last_processed}")
        if last_processed < total_emails:
            # total_emails is > 0 on this branch, so the division is safe.
            print(f"\nThere are more emails to process. Run the script again to continue.")
            print(f"Overall Progress: {last_processed / total_emails * 100:.2f}% complete")
        else:
            print("\nAll emails have been processed!")
    except Exception as e:
        # Top-level boundary: report and fall through to the final message.
        print(f"An unexpected error occurred: {e}")
        logging.error(f"An unexpected error occurred: {e}")
    print("Script execution completed.")


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement