Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- EXPLANATION:
- ROMANIAN: https://neculaifantanaru.com/python-split-all-text-files-from-folder.html
- ENGLISH: https://neculaifantanaru.com/en/python-split-all-text-files-from-folder.html
- -----------------------------
- import sys
- import os
- import nltk
- from nltk import tokenize
def read_text_from_file(file_path):
    """Return the entire contents of a text file decoded as UTF-8.

    file_path: path of the file to read.
    """
    with open(file_path, encoding='utf8') as source:
        return source.read()
def write_to_file(text, file_path):
    """Write a string to a file as UTF-8 bytes, replacing existing content.

    text: the string to write; characters that cannot be encoded as UTF-8
        are dropped (the 'ignore' error handler).
    file_path: path of the file to write.
    """
    with open(file_path, mode='wb') as destination:
        encoded = text.encode('utf8', errors='ignore')
        destination.write(encoded)
def imparte_fisiere(cale_fisier_txt, cale_folder_fisiere_impartite, chunk_size=5000):
    """Split one text file into several smaller files on sentence boundaries.

    The text is tokenized into sentences and the sentences are grouped into
    chunks. A chunk is closed as soon as its UTF-8 size has reached
    ``chunk_size`` bytes, so a chunk may exceed the limit by up to one
    sentence; sentences are never cut in half. Each chunk is written to
    ``<output folder>/<source name>_<n>.txt`` with ``n`` starting at 1.

    cale_fisier_txt: path of the text file to split.
    cale_folder_fisiere_impartite: folder that receives the chunk files; it
        is created if it does not exist yet.
    chunk_size: soft size limit of one chunk in bytes (default 5000).
    """
    text = read_text_from_file(cale_fisier_txt)
    propozitii = tokenize.sent_tokenize(text)

    # splitext removes only the final extension, so "a.b.txt" -> "a.b" and
    # differently named sources cannot collide on the same output name.
    nume_fisier = os.path.splitext(os.path.basename(cale_fisier_txt))[0]

    if cale_folder_fisiere_impartite:
        os.makedirs(cale_folder_fisiere_impartite, exist_ok=True)

    def scrie_chunk(propozitii_chunk, numar):
        # Write one finished chunk, e.g. "30_1.txt" for source "30.txt".
        cale_fisier_rezultat = os.path.join(
            cale_folder_fisiere_impartite,
            "{}_{}.txt".format(nume_fisier, numar),
        )
        write_to_file(" ".join(propozitii_chunk), cale_fisier_rezultat)

    chunk = []
    chunk_bytes = 0  # running UTF-8 size of the joined chunk
    chunk_number = 1
    for propozitie in propozitii:
        if chunk and chunk_bytes >= chunk_size:
            scrie_chunk(chunk, chunk_number)
            chunk_number += 1
            chunk = []
            chunk_bytes = 0
        # Count the single joining space for every sentence after the first.
        chunk_bytes += len(propozitie.encode('utf-8')) + (1 if chunk else 0)
        chunk.append(propozitie)

    # Flush whatever is left; without this the tail of the file (or the
    # whole file, when it is smaller than chunk_size) would be lost.
    if chunk:
        scrie_chunk(chunk, chunk_number)
def creare_fisiere(cale_folder_txt, cale_folder_fisiere_impartite):
    """Split every ``.txt`` file found directly inside a folder.

    Each regular file whose name ends in ``.txt`` is passed to
    ``imparte_fisiere``; sub-folders and other files are skipped. The number
    of processed files is printed at the end.

    Example: ``cale_folder_txt/30.txt`` produces
    ``cale_folder_fisiere_impartite/30_1.txt``,
    ``cale_folder_fisiere_impartite/30_2.txt``, and so on.

    cale_folder_txt: folder containing the source text files.
    cale_folder_fisiere_impartite: folder that receives the split files.
    """
    count = 0
    # sorted() makes the processing order deterministic across platforms.
    for nume in sorted(os.listdir(cale_folder_txt)):
        cale_fisier_txt = os.path.join(cale_folder_txt, nume)
        # Require the real ".txt" extension and skip directories that merely
        # happen to be named like a text file.
        if not nume.endswith('.txt') or not os.path.isfile(cale_fisier_txt):
            continue
        imparte_fisiere(cale_fisier_txt, cale_folder_fisiere_impartite)
        count += 1
    print("Numarul de fisiere modificate: ", count)
def main():
    """Run the splitter with the hard-coded source and destination folders."""
    source_folder = "c:\\Folder1"
    destination_folder = "c:\\Folder1\\fisiere_impartite"
    creare_fisiere(source_folder, destination_folder)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement