Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import pdfkit
- import tkinter as tk
- from tkinter import ttk, filedialog, messagebox
- from pathlib import Path
- from typing import List
- from PyPDF2 import PdfMerger
- from bs4 import BeautifulSoup
- import threading
class WebPageMergerGUI:
    """Tkinter front end that converts a directory of HTML files into one PDF.

    The conversion runs on a background daemon thread (``run_conversion``).
    Widget updates requested from that thread are re-scheduled onto the Tk
    event loop through ``root.after`` instead of touching widgets directly.
    """

    def __init__(self, root):
        """Configure the host window and build all widgets.

        Args:
            root: The Tk window (e.g. ``tk.Tk()``) that hosts the widgets.
        """
        self.root = root
        self.root.title("HTML to PDF Merger")
        self.running = False
        self.current_thread = None

        # Set minimum window size
        self.root.minsize(600, 400)

        # Column 0 stretches horizontally; row 2 (the log area) absorbs
        # any extra vertical space.
        self.root.grid_columnconfigure(0, weight=1)
        self.root.grid_rowconfigure(2, weight=1)

        self.setup_gui()

    def setup_gui(self):
        """Create the path selectors, log area, progress bar and buttons."""
        # Input directory selection
        input_frame = ttk.Frame(self.root, padding="10")
        input_frame.grid(row=0, column=0, sticky="ew")
        ttk.Label(input_frame, text="Input Directory:").pack(side=tk.LEFT)
        self.input_path = tk.StringVar()
        input_entry = ttk.Entry(input_frame, textvariable=self.input_path)
        input_entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=5)
        ttk.Button(input_frame, text="Browse", command=self.browse_input).pack(side=tk.LEFT)

        # Output file selection
        output_frame = ttk.Frame(self.root, padding="10")
        output_frame.grid(row=1, column=0, sticky="ew")
        ttk.Label(output_frame, text="Output PDF:").pack(side=tk.LEFT)
        self.output_path = tk.StringVar()
        output_entry = ttk.Entry(output_frame, textvariable=self.output_path)
        output_entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=5)
        ttk.Button(output_frame, text="Browse", command=self.browse_output).pack(side=tk.LEFT)

        # Progress area: scrolling text log
        self.log_frame = ttk.Frame(self.root, padding="10")
        self.log_frame.grid(row=2, column=0, sticky="nsew")
        self.log_text = tk.Text(self.log_frame, height=10, wrap=tk.WORD)
        scrollbar = ttk.Scrollbar(self.log_frame, orient="vertical", command=self.log_text.yview)
        self.log_text.configure(yscrollcommand=scrollbar.set)
        self.log_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

        # Progress bar (0-100, driven by ``progress_var``)
        self.progress_var = tk.DoubleVar()
        self.progress_bar = ttk.Progressbar(
            self.root,
            variable=self.progress_var,
            maximum=100,
        )
        self.progress_bar.grid(row=3, column=0, sticky="ew", padx=10, pady=5)

        # Buttons frame
        button_frame = ttk.Frame(self.root)
        button_frame.grid(row=4, column=0, pady=10)

        self.convert_button = ttk.Button(
            button_frame,
            text="Convert to PDF",
            command=self.start_conversion,
        )
        self.convert_button.pack(side=tk.LEFT, padx=5)

        # Stop is only enabled while a conversion is in flight.
        self.stop_button = ttk.Button(
            button_frame,
            text="Stop",
            command=self.stop_conversion,
            state=tk.DISABLED,
        )
        self.stop_button.pack(side=tk.LEFT, padx=5)

    def browse_input(self):
        """Ask for the input directory and store it if one was chosen."""
        directory = filedialog.askdirectory(title="Select Input Directory")
        if directory:
            self.input_path.set(directory)

    def browse_output(self):
        """Ask for the output PDF path and store it if one was chosen."""
        file_path = filedialog.asksaveasfilename(
            title="Save PDF As",
            defaultextension=".pdf",
            filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")],
        )
        if file_path:
            self.output_path.set(file_path)

    def _call_in_gui(self, func, *args):
        """Run ``func(*args)`` now on the main thread, else queue it via ``after``.

        NOTE(review): assumes the Tk event loop runs on Python's main thread,
        which is how ``main()`` in this file starts it.
        """
        if threading.current_thread() is threading.main_thread():
            func(*args)
        else:
            self.root.after(0, func, *args)

    def _append_log(self, message):
        """Append one line to the log widget and scroll it into view."""
        self.log_text.insert(tk.END, message + "\n")
        self.log_text.see(tk.END)
        self.root.update_idletasks()

    def _apply_progress(self, value):
        """Write ``value`` into the progress-bar variable."""
        self.progress_var.set(value)
        self.root.update_idletasks()

    def log(self, message):
        """Append ``message`` to the log area; may be called from any thread."""
        self._call_in_gui(self._append_log, message)

    def start_conversion(self):
        """Validate the two paths and launch the conversion on a worker thread."""
        # Read the Tk variables here, on the GUI thread, and hand plain
        # strings to the worker.
        input_dir = self.input_path.get()
        output_file = self.output_path.get()
        if not input_dir or not output_file:
            messagebox.showerror("Error", "Please select both input directory and output file location.")
            return

        self.convert_button.configure(state="disabled")
        self.stop_button.configure(state="normal")
        self.running = True
        self.progress_var.set(0)
        self.log_text.delete("1.0", tk.END)

        # Daemon thread: it must not keep the process alive after the
        # window is closed.
        self.current_thread = threading.Thread(
            target=self.run_conversion,
            args=(input_dir, output_file),
            daemon=True,
        )
        self.current_thread.start()

    def stop_conversion(self):
        """Request a stop; the worker notices through its stop-check callback."""
        if self.running:
            self.running = False
            self.log("Stopping conversion process...")
            self.stop_button.configure(state="disabled")

    def run_conversion(self, input_dir=None, output_file=None):
        """Worker-thread body: run the merge and report the outcome to the GUI.

        Args:
            input_dir: Directory containing the HTML files. When ``None`` the
                value is read from the input entry (backward compatible with
                the argument-less call).
            output_file: Destination PDF path. When ``None`` the value is read
                from the output entry.
        """
        if input_dir is None:
            input_dir = self.input_path.get()
        if output_file is None:
            output_file = self.output_path.get()

        try:
            merger = WebPageMerger(
                input_dir,
                output_file,
                self.log,
                self.update_progress,
                lambda: not self.running,  # Stop check callback
            )
            merger.merge_to_pdf()
        except InterruptedError:
            # The merger raises this when the stop check fires; that is a
            # user action, not a failure.
            self.root.after(0, self.conversion_stopped)
        except Exception as exc:
            # Pass the text as an argument: ``exc`` is unbound once this
            # clause ends, so a closure over it would fail when Tk finally
            # runs the callback.
            self.root.after(0, self.conversion_error, str(exc))
        else:
            if self.running:  # Only show success if not stopped
                self.root.after(0, self.conversion_complete)
            else:
                self.root.after(0, self.conversion_stopped)

    def conversion_complete(self):
        """Announce success, reset the buttons and open the resulting PDF."""
        messagebox.showinfo("Success", "PDF conversion completed successfully!")
        self.reset_gui()
        self._open_pdf(self.output_path.get())

    def _open_pdf(self, pdf_path):
        """Open ``pdf_path`` with the platform's default viewer."""
        import subprocess
        import sys

        try:
            if os.name == 'nt':  # Windows
                os.startfile(pdf_path)
            elif sys.platform == 'darwin':  # macOS (also reports os.name == 'posix')
                subprocess.run(['open', pdf_path], check=False)
            else:  # Linux and other POSIX systems
                subprocess.run(['xdg-open', pdf_path], check=False)
        except OSError as e:
            messagebox.showerror("Error", f"Could not open PDF: {str(e)}")

    def conversion_stopped(self):
        """Tell the user the run was cancelled and reset the buttons."""
        messagebox.showinfo("Stopped", "PDF conversion was stopped by user.")
        self.reset_gui()

    def conversion_error(self, error_message):
        """Show ``error_message`` in an error dialog and reset the buttons."""
        messagebox.showerror("Error", f"An error occurred during conversion:\n{error_message}")
        self.reset_gui()

    def reset_gui(self):
        """Return the buttons and the running flag to the idle state."""
        self.running = False
        self.convert_button.configure(state="normal")
        self.stop_button.configure(state="disabled")

    def update_progress(self, value):
        """Set the progress bar to ``value`` (0-100); may be called from any thread."""
        self._call_in_gui(self._apply_progress, value)
class WebPageMerger:
    """Convert every ``*.html`` file in a directory to PDF and merge the results.

    Each HTML file is rewritten with a fixed print stylesheet, rendered to an
    intermediate PDF with ``pdfkit`` and finally concatenated with
    ``PdfMerger``. Intermediate files live in a private scratch directory that
    is removed when ``merge_to_pdf`` finishes, whatever the outcome.
    """

    # Stylesheet injected into every page after its own styles are stripped.
    _PAGE_CSS = """
body {
font-family: Arial, sans-serif;
line-height: 1.6;
margin: 40px;
color: #333;
max-width: 900px;
margin: 40px auto;
}
h1, h2, h3 {
color: #000;
margin-top: 20px;
margin-bottom: 10px;
page-break-after: avoid;
}
p {
margin-bottom: 10px;
}
table {
border-collapse: collapse;
margin: 15px 0;
page-break-inside: avoid;
}
th, td {
border: 1px solid #ddd;
padding: 8px;
text-align: left;
}
th {
background-color: #f5f5f5;
}
a {
color: #0645ad;
text-decoration: none;
}
@media print {
a {
color: #000;
text-decoration: none;
}
}
"""

    # Options handed to pdfkit for each page; empty values are bare flags.
    _PDF_OPTIONS = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': 'UTF-8',
        'enable-local-file-access': '',
        'quiet': '',
        'no-images': '',
        'disable-external-links': '',
        'disable-javascript': '',
        'print-media-type': '',
    }

    def __init__(self, input_dir: str, output_file: str, log_callback, progress_callback, stop_check):
        """Store the paths and callbacks and create the scratch directory.

        Args:
            input_dir: Directory searched for ``*.html`` files.
            output_file: Path of the merged PDF to write.
            log_callback: Called with one string per log message.
            progress_callback: Called with a percentage (0-100) after each file.
            stop_check: Zero-argument callable; a true result cancels the run.
        """
        import tempfile

        self.input_dir = Path(input_dir).resolve()
        self.output_file = Path(output_file).resolve()
        # A private, uniquely named directory: cleanup deletes everything in
        # it, so it must never be a directory the user may already own.
        self.temp_dir = Path(tempfile.mkdtemp(prefix='temp_html_')).resolve()
        self.log = log_callback
        self.update_progress = progress_callback
        self.should_stop = stop_check

    def _raise_if_stopped(self) -> None:
        """Raise ``InterruptedError`` when the stop check reports a stop request."""
        if self.should_stop():
            raise InterruptedError("Conversion stopped by user")

    def _get_files(self) -> List[Path]:
        """Return the ``*.html`` files directly inside the input directory, sorted."""
        files = sorted(self.input_dir.glob('*.html'))
        self.log(f"Found {len(files)} HTML files.")
        return files

    def _process_html(self, input_file: Path) -> Path:
        """Process HTML file to make it self-contained with basic styling.

        Strips ``<style>``, ``<script>`` and ``<link>`` tags, injects the
        class stylesheet and writes the result into the scratch directory.

        Returns:
            Path of the rewritten HTML file.

        Raises:
            InterruptedError: If a stop was requested.
            Exception: Any read/parse/write failure, after it has been logged.
        """
        self._raise_if_stopped()
        try:
            # Read the original HTML
            with open(input_file, 'r', encoding='utf-8') as f:
                soup = BeautifulSoup(f.read(), 'html.parser')

            # Remove all existing styles and scripts
            for tag in soup.find_all(['style', 'script', 'link']):
                tag.decompose()

            # Add our own styling
            style = soup.new_tag('style')
            style.string = self._PAGE_CSS

            head = soup.head
            if head is None:
                # Fragments and minimal documents may have no <head>;
                # create one so the stylesheet still has a home.
                head = soup.new_tag('head')
                parent = soup.html if soup.html is not None else soup
                parent.insert(0, head)
            head.append(style)

            # Create temporary file
            output_file = self.temp_dir / f"processed_{input_file.name}"
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(str(soup))
            return output_file
        except Exception as e:
            self.log(f"Error processing HTML {input_file.name}: {str(e)}")
            raise

    def _convert_single_file(self, input_file: Path, output_file: Path) -> bool:
        """Render one HTML file to ``output_file``.

        Returns:
            ``True`` when a non-empty PDF exists afterwards, ``False`` when
            the conversion failed (the failure is logged).

        Raises:
            InterruptedError: If a stop was requested; it is not reported as
                a conversion failure.
        """
        self._raise_if_stopped()
        try:
            processed_file = self._process_html(input_file)
            pdfkit.from_file(
                str(processed_file),
                str(output_file),
                options=dict(self._PDF_OPTIONS),
            )
            return output_file.exists() and output_file.stat().st_size > 0
        except InterruptedError:
            raise
        except Exception as e:
            self.log(f"Error converting {input_file.name}: {str(e)}")
            return False

    def merge_to_pdf(self):
        """Convert all input files and write the merged PDF.

        Raises:
            ValueError: If no HTML files are found or none converts.
            InterruptedError: If the stop check fires during the run.
            Exception: Any other failure, after it has been logged.
        """
        import shutil

        try:
            # Re-create the scratch directory in case an earlier run of this
            # same instance already cleaned it up.
            self.temp_dir.mkdir(parents=True, exist_ok=True)

            input_files = self._get_files()
            if not input_files:
                raise ValueError(f"No HTML files found in {self.input_dir}")

            successful_pdfs = []
            total_files = len(input_files)
            self.log("\nConverting files to PDF...")
            for i, file in enumerate(input_files, 1):
                self._raise_if_stopped()
                self.log(f"\nProcessing: {file.name}")
                output_pdf = self.temp_dir / f"{file.stem}.pdf"
                if self._convert_single_file(file, output_pdf):
                    successful_pdfs.append(output_pdf)
                    self.log(f"Successfully converted: {file.name}")
                else:
                    self.log(f"Failed to convert: {file.name}")
                # Update progress bar
                self.update_progress((i / total_files) * 100)

            if not successful_pdfs:
                raise ValueError("No files were successfully converted to PDF")
            self._raise_if_stopped()

            self.log(f"\nMerging {len(successful_pdfs)} PDFs...")
            self.output_file.parent.mkdir(parents=True, exist_ok=True)
            merger = PdfMerger()
            try:
                for pdf in successful_pdfs:
                    if pdf.exists() and pdf.stat().st_size > 0:
                        merger.append(str(pdf))
                merger.write(str(self.output_file))
            finally:
                # Release the merger's open file handles even on failure.
                merger.close()

            self.log(f"\nSuccessfully created final PDF: {self.output_file}")
            self.log(f"Total files converted: {len(successful_pdfs)}/{len(input_files)}")
        except InterruptedError:
            self.log("\nConversion process stopped by user.")
            raise
        except Exception as e:
            self.log(f"Error in merge process: {str(e)}")
            raise
        finally:
            self.log("\nCleaning up temporary files...")
            # Best-effort removal: a leftover scratch file must not mask the
            # real outcome of the run.
            shutil.rmtree(self.temp_dir, ignore_errors=True)
def main():
    """Create the Tk root window, build the merger GUI on it and run the event loop."""
    window = tk.Tk()
    # The GUI object is bound to a name but not used again here; the event
    # loop below drives everything from this point on.
    gui = WebPageMergerGUI(window)
    window.mainloop()


# Launch the GUI only when this file is executed as a script.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement