SHOW:
|
|
- or go back to the newest paste.
1 | ---------------------- | |
2 | EXPLANATION: | |
3 | ||
4 | ENGLISH: https://neculaifantanaru.com/en/python-delete-all-files-with-less-than-250-characters.html | |
5 | ||
6 | ROMANIAN: https://neculaifantanaru.com/python-sterge-toate-fisierele-cu-mai-putin-de-250-de-caractere.html | |
7 | ---------------------- | |
8 | ||
9 | ||
10 | import os | |
11 | import re | |
12 | import random | |
13 | import unidecode | |
14 | import nltk | |
15 | from nltk import tokenize | |
16 | # nltk.download('punkt') | |
17 | import requests | |
18 | from usp.tree import sitemap_tree_for_homepage | |
19 | ||
def read_text_from_file(file_path):
    """
    Return the entire content of a text file.

    file_path: path of the file to read; it is decoded as UTF-8.
    """
    # The with-statement closes the file on exit, so no explicit close()
    # call is needed.
    with open(file_path, encoding='utf8') as f:
        return f.read()
29 | ||
30 | ||
def write_to_file(text, file_path):
    """
    Write a text to a file, replacing any previous content.

    text: the text to write
    file_path: path of the file to write to
    """
    # The file is opened in binary mode and the text is encoded explicitly.
    # The 'ignore' error handler drops characters that cannot be encoded as
    # UTF-8 (e.g. lone surrogates) instead of raising UnicodeEncodeError.
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))


# Delete every .html/.htm file in FOLDER_LOCAL whose text between the
# "<-- START -->" and "<-- FINAL -->" markers is shorter than 1500 characters.
FOLDER_LOCAL = 'd:\\Folder1'

# Raw string so that "\s" / "\S" reach the regex engine as written instead of
# relying on Python keeping an unrecognised string escape (which warns).
page_text_pattern = re.compile(r'<-- START -->([\s\S]*?)<-- FINAL -->')
counter_sterse = 0  # number of files deleted so far

for filename in os.listdir(FOLDER_LOCAL):
    if not filename.endswith(('.html', '.htm')):
        continue

    filepath = os.path.join(FOLDER_LOCAL, filename)
    page_html = read_text_from_file(filepath)

    # Only the first START/FINAL section is measured. A page without the
    # markers counts as empty text and is therefore deleted as well.
    found = page_text_pattern.search(page_html)
    page_text = found.group(1) if found else ''

    # print(page_text, len(page_text), filepath)

    if len(page_text) < 1500:
        os.remove(filepath)
        counter_sterse += 1

print("S-au sters {} fisiere".format(counter_sterse))