# Split code: split a PDF debate transcript into speaker/speech rows and export them to Excel.

import PyPDF2
import pandas as pd


# Function to check if a word starts with an uppercase letter
def starts_with_upper(word):
    """Return True when *word* is non-empty and its first character is uppercase."""
    if not word:
        return False
    first_char = word[0]
    return first_char.isupper()


# Function to split text into speaker and speech
def extract_speaker_speech(text):
    """Split a transcript line into ``(speaker, speech)`` at its first colon.

    A line counts as a speaker line only when it contains a colon and the
    last whitespace-separated word before that colon starts with an
    uppercase letter. Both parts are returned stripped of surrounding
    whitespace; any other line yields ``("", "")``.
    """
    head, colon, tail = text.partition(":")
    # split() discards all whitespace, so an empty list means there is no
    # word in front of the colon (e.g. ": text"). Indexing [-1] on it would
    # raise IndexError, so such lines are treated as non-speaker lines.
    words = head.split()
    if not colon or not words or not words[-1][0].isupper():
        return "", ""
    return head.strip(), tail.strip()


# Path of the PDF transcript to process
pdf_file = 'C:/Users/voltp/Downloads/2 seja_35-112.pdf'

# Read the text of every page with PyPDF2. Pages are joined with a newline so
# the last line of one page is not fused with the first line of the next,
# which would hide a speaker line that happens to start a page.
with open(pdf_file, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    # `or ""` keeps the join safe if a page yields no text at all.
    text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

# Extract speaker and speech content, concatenating all lines that belong to
# the same speaker turn into a single "Speech" entry.
lines = text.split("\n")
data = {"Speaker": [], "Speech": []}


def _store_turn(speaker, speech_parts):
    """Append one finished speaker turn to ``data`` if it has any speech."""
    if speaker and speech_parts:
        data["Speaker"].append(speaker)
        data["Speech"].append("\n".join(speech_parts))


current_speaker = ""
speech_parts = []  # lines of the turn currently being collected

for line in lines:
    speaker, speech = extract_speaker_speech(line)
    if speaker:
        # A new speaker line closes the previous turn.
        _store_turn(current_speaker, speech_parts)
        current_speaker = speaker
        # The speaker line may carry no text after the colon; the speech
        # then starts on a following line.
        speech_parts = [speech] if speech else []
    elif current_speaker and (speech_parts or line.strip()):
        # Continuation is keyed on the active speaker rather than on already
        # collected speech, so a turn whose speaker line was empty is kept.
        # Blank lines are skipped only before the first speech text.
        speech_parts.append(line)

# Store the last speaker turn
_store_turn(current_speaker, speech_parts)

# Create a DataFrame from the extracted data
df = pd.DataFrame(data)

# Save the DataFrame to Excel. The same name is reused in the message below so
# the reported file is always the one that was actually written.
output_file = 'parlamentary_debates.xlsx'
df.to_excel(output_file, index=False)

print(f"Excel file '{output_file}' created successfully.")