nicuf

Delete All Files With Less Than 250 Characters - part.2

Apr 12th, 2022 (edited)
451
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ----------------------
  2. EXPLANATION:
  3.  
  4. ENGLISH: https://neculaifantanaru.com/en/python-delete-all-files-with-less-than-250-characters.html
  5.  
  6. ROMANIAN: https://neculaifantanaru.com/python-sterge-toate-fisierele-cu-mai-putin-de-250-de-caractere.html
  7. ----------------------
  8.  
  9.  
  10. import os
  11. import re
  12. import random
  13. import unidecode
  14. import nltk
  15. from nltk import tokenize
  16. # nltk.download('punkt')
  17. import requests
  18. from usp.tree import sitemap_tree_for_homepage
  19.  
  20. def read_text_from_file(file_path):
  21.     """
  22.    Aceasta functie returneaza continutul unui fisier.
  23.    file_path: calea catre fisierul din care vrei sa citesti
  24.    """
  25.     with open(file_path, encoding='utf8') as f:
  26.         text = f.read()
  27.         f.close()
  28.         return text
  29.    
  30.  
  31. def write_to_file(text, file_path):
  32.     """
  33.    Aceasta functie scrie un text intr-un fisier.
  34.    text: textul pe care vrei sa il scrii
  35.    file_path: calea catre fisierul in care vrei sa scrii
  36.    """
  37.     with open(file_path, 'wb') as f:
  38.         f.write(text.encode('utf8', 'ignore'))
  39.         f.close()
  40.  
  41. # 1. Preluare site-uri de pe o anumita pagina (vezi variabila PAGE)
  42. FOLDER_LOCAL = 'd:\\Folder1'
  43.  
  44. page_text_pattern = re.compile('<-- START -->([\s\S]*?)<-- FINAL -->')
  45. counter_sterse = 0
  46.  
  47. for f in os.listdir(FOLDER_LOCAL):
  48.     if f.endswith('.html') or f.endswith('.htm'):
  49.         filepath = os.path.join(FOLDER_LOCAL, f)
  50.         page_html = read_text_from_file(filepath)
  51.         page_text = re.findall(page_text_pattern, page_html)
  52.         if len(page_text) != 0:
  53.             page_text = page_text[0]
  54.  
  55.             # print(page_text, len(page_text), filepath)
  56.  
  57.             if len(page_text) < 1500:
  58.                 os.remove(filepath)
  59.                 counter_sterse += 1
  60.                 continue
  61.  
  62. print("S-au sters {} fisiere".format(counter_sterse))
Add Comment
Please, Sign In to add comment