Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ----------------------
- EXPLANATION:
- ENGLISH: https://neculaifantanaru.com/en/python-delete-all-files-with-less-than-250-characters.html
- ROMANIAN: https://neculaifantanaru.com/python-sterge-toate-fisierele-cu-mai-putin-de-250-de-caractere.html
- ----------------------
- import os
- import re
- import random
- import unidecode
- import nltk
- from nltk import tokenize
- # nltk.download('punkt')
- import requests
- from usp.tree import sitemap_tree_for_homepage
def read_text_from_file(file_path: str) -> str:
    """Return the entire contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The ``with`` statement closes the file on exit, so no explicit
    # ``close()`` call is needed.
    with open(file_path, encoding='utf8') as f:
        return f.read()
def write_to_file(text: str, file_path: str) -> None:
    """Write ``text`` to ``file_path`` as UTF-8, replacing any existing file.

    Args:
        text: The text to write.
        file_path: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # The file is opened in binary mode and the text is encoded by hand, so
    # the bytes written are exactly the UTF-8 encoding of ``text``.
    # Characters that cannot be encoded (e.g. lone surrogates) are dropped
    # silently because of the 'ignore' error handler.
    # The ``with`` statement closes the file; no explicit ``close()`` needed.
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))
# Delete every .html/.htm file directly inside FOLDER_LOCAL whose text between
# the "<-- START -->" and "<-- FINAL -->" markers is shorter than 1500
# characters, then report how many files were removed.
# NOTE(review): the explanation linked in the header mentions 250 characters,
# while the threshold used here is 1500 - confirm which one is intended.
FOLDER_LOCAL = 'd:\\Folder1'
# Raw string: \s and \S are regex classes, not Python string escapes.
page_text_pattern = re.compile(r'<-- START -->([\s\S]*?)<-- FINAL -->')
counter_sterse = 0  # number of files deleted so far
for f in os.listdir(FOLDER_LOCAL):
    if not f.endswith(('.html', '.htm')):
        continue
    filepath = os.path.join(FOLDER_LOCAL, f)
    page_html = read_text_from_file(filepath)
    # Only the first START/FINAL section of the page is considered.
    match = page_text_pattern.search(page_html)
    # NOTE(review): the original indentation was lost; this assumes files
    # without the markers are left untouched rather than deleted - confirm.
    if match is None:
        continue
    page_text = match.group(1)
    if len(page_text) < 1500:
        os.remove(filepath)
        counter_sterse += 1
print("S-au sters {} fisiere".format(counter_sterse))
Add Comment
Please, Sign In to add comment