Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import pandas as pd
import PyPDF2
def starts_with_upper(word: str) -> bool:
    """Return True if *word* is non-empty and its first character is uppercase.

    Args:
        word: The string to inspect; may be empty.

    Returns:
        True when the first character is an uppercase letter, otherwise
        False (including for the empty string).
    """
    # Slicing instead of indexing yields "" for an empty word, and
    # "".isupper() is False, so no separate length check is needed.
    return word[:1].isupper()
def extract_speaker_speech(text: str) -> tuple:
    """Split a line of the form ``"<speaker>: <speech>"`` into its two parts.

    A line is treated as a speaker line only when it contains a colon and
    the last word before the first colon starts with an uppercase letter.

    Args:
        text: A single line of text.

    Returns:
        A ``(speaker, speech)`` tuple of stripped strings, or ``("", "")``
        when the line is not recognised as a speaker line.
    """
    # Only the first colon separates speaker from speech; later colons
    # stay inside the speech text.
    label, colon, remainder = text.partition(":")
    if not colon:
        return "", ""

    # An empty or whitespace-only label (e.g. ": foo") has no last word to
    # inspect; treat it as "not a speaker line" instead of raising IndexError.
    label_words = label.split()
    if not label_words or not starts_with_upper(label_words[-1]):
        return "", ""

    return label.strip(), remainder.strip()
# Input PDF and output workbook locations.
pdf_file = 'C:/Users/voltp/Downloads/2 seja_35-112.pdf'
output_file = 'parlamentary_debates.xlsx'


def _read_pdf_text(path):
    """Return the text of every page of the PDF at *path*, joined by newlines."""
    with open(path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Pages are joined with "\n" so the last line of one page is not
        # glued onto the first line of the next (which could hide or corrupt
        # a speaker label). `or ""` guards against a page yielding no text.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)


def _group_speeches(lines):
    """Group *lines* into parallel "Speaker" / "Speech" columns.

    A line recognised by ``extract_speaker_speech`` starts a new entry;
    every following line is appended to that entry's speech until the next
    speaker line. Entries whose speech is empty are not recorded.
    """
    data = {"Speaker": [], "Speech": []}
    current_speaker = ""
    speech_parts = []

    def flush():
        speech_text = "\n".join(speech_parts)
        if current_speaker and speech_text:
            data["Speaker"].append(current_speaker)
            data["Speech"].append(speech_text)

    for line in lines:
        speaker, speech = extract_speaker_speech(line)
        if speaker:
            flush()
            current_speaker = speaker
            speech_parts = [speech] if speech else []
        elif speech_parts or (current_speaker and line.strip()):
            # Continuation of the current speech. The second condition keeps
            # text that follows a label with nothing after its colon (speaker
            # name alone on its line), while skipping leading blank lines.
            speech_parts.append(line)

    # Record the final speaker, which no later speaker line will flush.
    flush()
    return data


def main():
    """Extract the debate from ``pdf_file`` and save it to ``output_file``."""
    lines = _read_pdf_text(pdf_file).split("\n")
    df = pd.DataFrame(_group_speeches(lines))
    df.to_excel(output_file, index=False)
    # Report the name actually written rather than a separately typed literal.
    print(f"Excel file '{output_file}' created successfully.")


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement