View difference between Paste ID: au4XD6ce and eBmEEFkR
SHOW: | | - or go back to the newest paste.
1
----------------------
2
EXPLANATION:
3
4
ENGLISH: https://neculaifantanaru.com/en/python-delete-all-files-with-less-than-250-characters.html
5
6
ROMANIAN: https://neculaifantanaru.com/python-sterge-toate-fisierele-cu-mai-putin-de-250-de-caractere.html
7
----------------------
8
9
10
import os
11
import re
12
import random
13
import unidecode
14
import nltk
15
from nltk import tokenize
16
# nltk.download('punkt')
17
import requests
18
from usp.tree import sitemap_tree_for_homepage
19
20
def read_text_from_file(file_path):
    """
    Return the entire contents of a text file.

    file_path: path of the file to read; it is opened as UTF-8 text.

    Returns the file contents as a single string.
    Raises OSError if the file cannot be opened and UnicodeDecodeError if
    its bytes are not valid UTF-8.
    """
    # The with-statement closes the file on exit, so no explicit close()
    # call is needed.
    with open(file_path, encoding='utf8') as f:
        return f.read()
29
    
30
31
def write_to_file(text, file_path):
    """
    Write a text to a file, replacing any existing content.

    text: the string to write
    file_path: path of the file to write to

    The text is encoded as UTF-8; characters that cannot be encoded
    (for example lone surrogates) are silently dropped.
    """
    # Binary mode writes the encoded bytes unchanged, with no newline
    # translation.
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))

41
# 1. Delete every .html/.htm file in FOLDER_LOCAL whose text between the
#    "<-- START -->" and "<-- FINAL -->" markers is shorter than
#    MIN_TEXT_LENGTH characters. Files without the markers are left alone.
FOLDER_LOCAL = 'd:\\Folder1'

# Files whose marked section has fewer characters than this are deleted.
MIN_TEXT_LENGTH = 1500

# Raw string so that \s and \S reach the regex engine instead of being
# treated as (invalid) string escape sequences.
page_text_pattern = re.compile(r'<-- START -->([\s\S]*?)<-- FINAL -->')
counter_sterse = 0

for f in os.listdir(FOLDER_LOCAL):
    # Only HTML pages are candidates for deletion.
    if not f.endswith(('.html', '.htm')):
        continue

    filepath = os.path.join(FOLDER_LOCAL, f)
    page_html = read_text_from_file(filepath)

    # Only the first marked section of the page is measured.
    found = page_text_pattern.search(page_html)
    if found is None:
        continue
    page_text = found.group(1)

    if len(page_text) < MIN_TEXT_LENGTH:
        os.remove(filepath)
        counter_sterse += 1

print("S-au sters {} fisiere".format(counter_sterse))