Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- -----------------------------
- EXPLANATION:
- ENGLISH: https://neculaifantanaru.com/en/python-split-the-text-by-a-certain-number-of-characters-in-an-html-paragraph.html
- ROMANIAN: https://neculaifantanaru.com/python-imparte-textul-dupa-un-anumit-numar-de-caractere-intr-un-paragraf-html.html
- -----------------------------
- import requests
- import re
- import os
- import nltk
- from nltk import tokenize
- cale_folder_html = r"e:Test"  # folder whose entries are listed and processed below; NOTE(review): the value may have lost a backslash (e:\Test?) when pasted - confirm
- extension_file = ".html"  # only file names ending with this suffix are processed
def read_text_from_file(file_path):
    """
    Return the whole content of a file, decoded as UTF-8.
    file_path: path of the file to read from
    """
    with open(file_path, encoding='utf8') as source:
        return source.read()
def write_to_file(text, file_path):
    """
    Write a text to a file as UTF-8 bytes, replacing any previous content.
    text: the text to write
    file_path: path of the file to write to
    """
    with open(file_path, 'wb') as destination:
        # Characters that cannot be encoded as UTF-8 are dropped ('ignore').
        encoded = text.encode('utf8', 'ignore')
        destination.write(encoded)
# Markers delimiting the article body and the paragraph markup handled here.
# Compiled once (raw strings) instead of once per file.
ARTICOL_PATTERN = re.compile(r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->')
P_PATTERN = re.compile(r'<p class="text_obisnuit">(.*?)</p>')
PATTERN_PARAGRAF_NOU = '<p class="text_obisnuit">{}</p>'
ARTICOL_START_FINAL = '<!-- ARTICOL START -->\n{}\n<!-- ARTICOL FINAL -->'
# Target size of a paragraph produced by splitting.
LIMITA_CARACTERE = 250
# Files in the folder that are never processed.
FISIERE_IGNORATE = ('y_key_e479323ce281e459.html', 'directory.html')


def normalize_sentence(propozitie):
    """Trim a sentence, upper-case its first character and remove any
    whitespace standing right before its last character.

    Only the first character is changed: str.capitalize() would also
    lower-case the rest of the sentence (names, acronyms, URLs in links).
    Returns '' for an empty or whitespace-only sentence.
    """
    propozitie = propozitie.strip()
    if not propozitie:
        return ''
    propozitie = propozitie[0].upper() + propozitie[1:]
    return propozitie[:-1].strip() + propozitie[-1]


def split_paragraph(p, limita_caractere=LIMITA_CARACTERE, sent_tokenize=None):
    """Split the text of a paragraph into chunks made of whole sentences.

    p: text of the paragraph (without the surrounding <p> tag)
    limita_caractere: a sentence is added to the current chunk only while
        len(chunk) + len(sentence) stays below this limit
    sent_tokenize: callable that splits a text into sentences; defaults to
        nltk's tokenize.sent_tokenize
    Returns the list of chunks. No sentence is ever dropped: a sentence that
    does not fit starts the next chunk, and a single sentence longer than the
    limit becomes a chunk of its own.
    """
    if sent_tokenize is None:
        sent_tokenize = tokenize.sent_tokenize
    bucati = []
    curent = ''
    for propozitie in sent_tokenize(p):
        propozitie = normalize_sentence(propozitie)
        if not propozitie:
            continue
        if not curent:
            curent = propozitie
        elif len(curent) + len(propozitie) < limita_caractere:
            curent = curent + ' ' + propozitie
        else:
            # The chunk is full: store it and start the next one with the
            # sentence that did not fit.
            bucati.append(curent)
            curent = propozitie
    if curent:
        bucati.append(curent)
    # If the tokenizer produced nothing usable, keep the paragraph as it was.
    return bucati or [p]


def rebuild_article(html_text, sent_tokenize=None):
    """Return html_text with the long paragraphs of its article split, or
    None when there is nothing to change.

    html_text: full content of an HTML page
    sent_tokenize: forwarded to split_paragraph
    None is returned when the page has no ARTICOL START/FINAL section or
    when splitting leaves the number of paragraphs unchanged.

    NOTE(review): the section is rebuilt only from the single-line
    <p class="text_obisnuit"> paragraphs found in it; any other markup
    between the two markers is not carried over - confirm this is intended.
    """
    articol = ARTICOL_PATTERN.search(html_text)
    if articol is None:
        return None
    paragrafe = P_PATTERN.findall(articol.group(1))
    # Split every paragraph that exceeds the accepted lengths into smaller ones.
    paragrafe_split = []
    for p in paragrafe:
        # NOTE(review): lengths 200-250 and >= 300 are sent to the splitter,
        # 251-299 are kept untouched; thresholds preserved from the original.
        if (250 < len(p) < 300) or len(p) < 200:
            paragrafe_split.append(PATTERN_PARAGRAF_NOU.format(p))
        else:
            paragrafe_split.extend(
                PATTERN_PARAGRAF_NOU.format(bucata)
                for bucata in split_paragraph(p, LIMITA_CARACTERE, sent_tokenize)
            )
    if len(paragrafe) == len(paragrafe_split):
        return None
    # Build the text between ARTICOL START/FINAL from the new paragraphs.
    articol_nou = ARTICOL_START_FINAL.format('\n'.join(paragrafe_split))
    # Splice by position instead of re.sub: a replacement string would have
    # its backslashes interpreted as escapes, and only the section that was
    # actually parsed must be replaced.
    return html_text[:articol.start()] + articol_nou + html_text[articol.end():]


def main():
    """Process every HTML file of the configured folder, rewriting in place
    the ones whose article paragraphs were split."""
    print('Going through folder')
    amount = 0
    for filename in os.listdir(cale_folder_html):
        if filename in FISIERE_IGNORATE or not filename.endswith(extension_file):
            continue
        cale_fisier_html = os.path.join(cale_folder_html, filename)
        html_text = read_text_from_file(cale_fisier_html)
        html_nou = rebuild_article(html_text)
        if html_nou is None:
            continue
        write_to_file(html_nou, cale_fisier_html)
        print("Am modificat: {}".format(filename))
        amount += 1
    print("Am modificat {} fisiere.".format(amount))


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement