Advertisement
makispaiktis

Frequency appearance of letters

Jul 18th, 2023
657
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.56 KB | None | 0 0
  1. # 1. Execute a simple GET HTTP request using 'requests' library
  2.  
  3. import requests
  4. from bs4 import BeautifulSoup
  5.  
  6. url = 'https://www.bbc.com/news/business-66217641'
  7. # Send a GET request to the website
  8. response = requests.get(url)
  9.  
  10.  
  11.  
  12. # 2. Create a .txt file that contains all the words
  13.  
  14. # Check if the request was successful
  15. if response.status_code == 200:
  16.  
  17.     # Create a BeautifulSoup object to parse the HTML content
  18.     soup = BeautifulSoup(response.content, 'html.parser')
  19.     # Find all the text elements on the website (e.g., paragraphs, headers, etc.)
  20.     text_elements = soup.find_all(text=True)
  21.     # Filter out empty and whitespace-only elements, and exclude script tags
  22.     sentences = [text.strip() for text in text_elements if text.strip() and text.parent.name != 'script']
  23.     # Join the sentences with newline characters
  24.     text_content = '\n'.join(sentences)
  25.     # Save the text content to a .txt file
  26.     with open('info.txt', 'w', encoding='utf-8') as file:
  27.         file.write(text_content)
  28.     print("Text copied from the website and saved as 'info.txt' successfully.")
  29.  
  30. else:
  31.  
  32.     print("Failed to retrieve content from the website.")
  33.  
  34.  
  35.  
  36. # 3. Open the recently-created file named 'info.txt'
  37.  
  38. file_path = './info.txt'
  39.  
  40. with open(file_path, 'r', encoding='utf-8') as file:
  41.     file_contents = file.read()
  42.  
  43.  
  44.  
  45. # 4. Split into lines - Delete the first 6 lines
  46.  
  47. lines = file_contents.splitlines()
  48. DELETED = 6
  49. for _ in range(DELETED):
  50.     lines.pop(0)
  51. print("First 15 rows now:", lines[0:15], sep='\n')
  52.  
  53.  
  54.  
  55. # 5. Delete the word 'BBC' from every line
  56.  
  57. bad = 'BBC'
  58. for index, line in enumerate(lines):
  59.     modified = line.replace(bad, "")
  60.     lines[index] = modified
  61. print("First 15 rows now:", lines[0:15], sep='\n', end='\n\n')
  62. print("Until now, we have a list with all the text lines in the website. We are only interested in the letters,")
  63. print("so we can join all the lines together into a single one!")
  64.  
  65.  
  66.  
  67. # 6. Join all the lines (strings) into a new string
  68.  
  69. sep = ''
  70. sentences = sep.join(lines)
  71. print("Last 100 characters --->", sentences[-100:])
  72.  
  73.  
  74.  
  75. # 7. Keep only the letters and lowercase them
  76.  
  77. alphabet = "abcdefghijklmnopqrstuvwxyz"
  78. sentences = sentences.lower()
  79. sentences = [char for char in sentences if char in alphabet]
  80. print("Last 100 characters --->", sentences[-100:])
  81.  
  82.  
  83.  
  84. # 8. Create a dictionary with all the unique letters and their frequency appearance
  85.  
  86. unique = {char : sentences.count(char) for char in set(sentences)}
  87. values = list(unique.values())
  88. SUM = sum(values)
  89. print("Scanned a text with {} letters:".format(SUM), end='\n\n')
  90. print(unique)
  91. sorted_unique = sorted(unique.items())
  92. print(sorted_unique)
  93.  
  94.  
  95.  
  96. # 9. Plot the frequency appearance
  97.  
  98. import numpy as np
  99. import matplotlib.pyplot as plt
  100.  
  101. freq_app = {}
  102. for tup in sorted_unique:
  103.     letter = tup[0]
  104.     app = tup[1]
  105.     freq_app[letter] = round(100 * app / SUM, 2)
  106. print(freq_app, end='\n\n')
  107.  
  108. keys = list(freq_app.keys())
  109. values = list(freq_app.values())
  110. LEN = len(freq_app)
  111.  
  112. plt.bar(keys, values)
  113. plt.title("Frequency appearance of letters")
  114. plt.ylabel("Frequency appearance (%)")
  115. plt.show()
  116.  
  117.  
  118.  
  119. # 10. Sort by values (descending order)
  120.  
  121. app_desc = sorted(freq_app.items(), key=lambda item : item[1], reverse=True)
  122. print(app_desc, end='\n\n')
  123. app_desc = dict(app_desc)
  124. new_keys = list(app_desc.keys())
  125. new_values = list(app_desc.values())
  126.  
  127. xvalues = np.arange(0, LEN, 1)
  128. plt.bar(xvalues, new_values)
  129. plt.xticks(xvalues, new_keys)
  130. plt.title("Frequency appearance of letters")
  131. plt.ylabel("Frequency appearance (%)")
  132. plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement