Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/python3
# Example invocation (Windows):
# "C:\Users\flo-f\AppData\Roaming\Microsoft\Windows\Start Menu\Programs\Python 3.6\Python 3.6 (32-bit).lnk" C:\Users\flo-f\Downloads\delete-phrases-repetition.py "C:\Users\flo-f\Desktop\Corpus ricostruito (FG).txt" 20 "C:\Users\flo-f\Desktop\Corpus nuovo.txt"
import sys
import os.path

# Default number of separator occurrences that make up one phrase.
wnumber = 10
# This is the separator: if " " you look for words, if "." you look for phrases.
searchthis = " "
def nth_replace(string, old, new, n=1, option='only nth'):
    """
    Replace occurrences of substring 'old' with 'new', relative to the nth one.

    There are three types of replacement of string 'old':
    1) 'only nth' replaces only nth occurrence (default).
    2) 'all left' replaces nth occurrence and all occurrences to the left.
    3) 'all right' replaces nth occurrence and all occurrences to the right.

    Parameters:
        string: the text to operate on.
        old: the substring to look for (must be non-empty; str.split
            raises ValueError for an empty separator).
        new: the replacement text.
        n: 1-based index of the pivot occurrence. Values below 1 are
            not validated.
        option: one of 'only nth', 'all left', 'all right'.

    Returns:
        The resulting string, or None (after printing a message) when
        'option' is not one of the three accepted values.

    If 'old' occurs fewer than n times there is no nth occurrence:
    'only nth' and 'all right' return the string unchanged, while
    'all left' replaces every occurrence that does exist.
    """
    # For each option: (what to put between the pieces left of the nth
    # occurrence, what to put between the pieces right of it).
    joiners = {
        'only nth': (old, old),
        'all left': (new, old),
        'all right': (old, new),
    }
    if option not in joiners:
        print("Invalid option. Please choose from: 'only nth' (default), 'all left' or 'all right'")
        return None
    left_join, right_join = joiners[option]

    groups = string.split(old)
    # len(groups) - 1 is the number of occurrences of 'old'. Without this
    # guard the final join would glue an empty right-hand side onto the
    # text and append a stray 'new' at the end.
    if n >= len(groups):
        return left_join.join(groups)

    nth_split = [left_join.join(groups[:n]), right_join.join(groups[n:])]
    return new.join(nth_split)
# ---------------------------------------------------------------------------
# Script body: read a text, remove every repetition (after the first) of each
# phrase made of up to `wnumber` separator-delimited pieces, and write or
# print the result.
#
# Command line: <input file> [<wnumber>] [<output file>] [<separator>]
# ---------------------------------------------------------------------------
filein = ""
fileout = ""
text = ""

if len(sys.argv) > 1:
    filein = sys.argv[1]
    if filein == "-h":
        print("Usage:\n delete-phrases-repetition.py \"fileinput.txt\" 10 \"fileoutput.txt\" \" \"\n delete-phrases-repetition.py \"fileinput.txt\" 1 \"fileoutput.txt\" \".\"\nNOTE: The number represents occurrences of the separator to get a phrase. If you use \".\" as separator, the number should be 1.")
        sys.exit()
if len(sys.argv) > 2:
    wnumber = int(sys.argv[2])
if len(sys.argv) > 3:
    fileout = sys.argv[3]
if len(sys.argv) > 4:
    searchthis = sys.argv[4]

# Read the text from the input file when one was given and exists,
# otherwise ask for it on the console.
if filein != "" and os.path.isfile(filein):
    with open(filein, "r") as text_file:
        text = text_file.read()
else:
    text = input("Testo:")

# A negative phrase length makes no sense; fall back to the default.
if wnumber < 0:
    wnumber = 10

pos = 0
# `text` shrinks while repetitions are removed, so its length is
# re-evaluated on every pass.
while pos < len(text):
    # Advance npos over (up to) wnumber separators starting from pos.
    npos = pos
    for _ in range(wnumber):
        wpos = text.find(searchthis, npos + 1)
        if wpos < 0:
            # No further separator: later searches would fail as well.
            break
        npos = wpos

    # The phrase runs from pos up to (not including) the last separator found.
    tmpstring = text[pos:npos]

    # Replace all occurrences of the phrase, after the first one, with nothing.
    if tmpstring != "":
        text = nth_replace(text, tmpstring, "", 2, "all right")

    # Continue from the piece that follows the next separator; when there
    # is none, find() returns -1 and we jump to the end of the text.
    pos = text.find(searchthis, pos + 1) + 1
    if pos <= 0:
        pos = len(text)

# Delete double spaces left behind by the removed phrases. Working from
# `text` (not a variable assigned only inside the loop) keeps this valid
# even when no replacement was ever performed.
newtext = text.replace("  ", " ")

if fileout:
    with open(fileout, "w") as text_file:
        text_file.write(newtext)
else:
    print(newtext)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement