Advertisement
Guest User

Split code

a guest
Dec 13th, 2023
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.87 KB | None | 0 0
  1. import PyPDF2
  2. import pandas as pd
  3.  
  4.  
  5. # Function to check if a word starts with an uppercase letter
  6. def starts_with_upper(word):
  7. return len(word) > 0 and word[0].isupper()
  8.  
  9.  
  10. # Function to split text into speaker and speech
  11. def extract_speaker_speech(text):
  12. speaker = ""
  13. speech = ""
  14. split_text = text.split(":", 1)
  15.  
  16. if len(split_text) > 1 and starts_with_upper(split_text[0].split()[-1]):
  17. speaker = split_text[0].strip()
  18. speech = split_text[1].strip()
  19. return speaker, speech
  20.  
  21.  
  22. # Open the PDF file
  23. pdf_file = ('C:/Users/voltp/Downloads/2 seja_35-112.pdf')
  24.  
  25. # Open the PDF using PyPDF2
  26. with open(pdf_file, 'rb') as file:
  27. pdf_reader = PyPDF2.PdfReader(file)
  28. text = ''
  29. for page_num in range(len(pdf_reader.pages)):
  30. page = pdf_reader.pages[page_num]
  31. text += page.extract_text()
  32.  
  33. # Extract speaker and speech content while concatenating speech for the same speaker
  34. lines = text.split("\n")
  35. data = {"Speaker": [], "Speech": []}
  36. current_speaker = ""
  37. current_speech = ""
  38.  
  39. for line in lines:
  40. speaker, speech = extract_speaker_speech(line)
  41. if speaker:
  42. if current_speaker and current_speech:
  43. data["Speaker"].append(current_speaker)
  44. data["Speech"].append(current_speech)
  45.  
  46. current_speaker = speaker
  47. current_speech = speech
  48. elif current_speech: # Concatenate speech content until a new speaker is detected
  49. current_speech += "\n" + line
  50.  
  51. # Append the last speaker and speech content
  52. if current_speaker and current_speech:
  53. data["Speaker"].append(current_speaker)
  54. data["Speech"].append(current_speech)
  55.  
  56. # Create a DataFrame from the extracted data
  57. df = pd.DataFrame(data)
  58.  
  59. # Save the DataFrame to Excel
  60. df.to_excel('parlamentary_debates.xlsx', index=False)
  61.  
  62. print("Excel file 'parliament_debates.xlsx' created successfully.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement