SHARE
TWEET

delete-phrases-repetition.py

TringaliLuca Mar 29th, 2018 (edited) 85 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/python3
  2. #"C:\Users\flo-f\AppData\Roaming\Microsoft\Windows\Start Menu\Programs\Python 3.6\Python 3.6 (32-bit).lnk"   C:\Users\flo-f\Downloads\delete-phrases-repetition.py "C:\Users\flo-f\Desktop\Corpus ricostruito (FG).txt" 20 "C:\Users\flo-f\Desktop\Corpus nuovo.txt"
  3. import sys
  4. import os.path
  5.  
# Default number of separator occurrences that make up one phrase; replaced by
# the second command-line argument when one is given.
wnumber = 10
# Separator searched for in the text; replaced by the fourth command-line
# argument when one is given.
searchthis = " " # with " " you look for words, with "." you look for phrases
  8.  
  9. def nth_replace(string, old, new, n=1, option='only nth'):
  10.     """
  11.    This function replaces occurrences of string 'old' with string 'new'.
  12.    There are three types of replacement of string 'old':
  13.    1) 'only nth' replaces only nth occurrence (default).
  14.    2) 'all left' replaces nth occurrence and all occurrences to the left.
  15.    3) 'all right' replaces nth occurrence and all occurrences to the right.
  16.    """
  17.     if option == 'only nth':
  18.         left_join = old
  19.         right_join = old
  20.     elif option == 'all left':
  21.         left_join = new
  22.         right_join = old
  23.     elif option == 'all right':
  24.         left_join = old
  25.         right_join = new
  26.     else:
  27.         print("Invalid option. Please choose from: 'only nth' (default), 'all left' or 'all right'")
  28.         return None
  29.     groups = string.split(old)
  30.     nth_split = [left_join.join(groups[:n]), right_join.join(groups[n:])]
  31.     return new.join(nth_split)
  32.  
  33. filein = ""
  34. fileout = ""
  35. text = ""
  36.  
  37. if len(sys.argv) > 1:
  38.     filein = sys.argv[1]
  39. if filein == "-h":
  40.     print("Usage:\n delete-phrases-repetition.py \"fileinput.txt\" 10 \"fileoutput.txt\" \" \"\n delete-phrases-repetition.py \"fileinput.txt\" 1 \"fileoutput.txt\" \".\"\nNOTE: The number represents occurrences of the separator to get a phrase. If you use \".\" as separator, the number should be 1.")
  41.     sys.exit()
  42.  
  43. if len(sys.argv) > 2:
  44.    wnumber = int(sys.argv[2])
  45.  
  46. if len(sys.argv) > 3:
  47.     fileout = sys.argv[3]
  48.  
  49. if len(sys.argv) > 4:
  50.     searchthis = sys.argv[4]
  51.  
  52. if filein != "" and os.path.isfile(filein):
  53.     text_file = open(filein , "r")
  54.     #text = text_file.read().replace("\n", "")
  55.     text = text_file.read()
  56.     text_file.close()
  57. else:
  58.     text = input("Testo:")
  59.  
  60. if wnumber < 0:
  61.     wnumber = 10
  62.  
  63.  
  64. active = 1
  65. pos = 0
  66. while active:
  67.     wpos = pos
  68.     npos = pos
  69.     #read a specific number of words
  70.     for i in range(wnumber):
  71.         wpos = text.find(searchthis, npos+1)
  72.         if wpos > 0:
  73.             npos = wpos
  74.     #check if we reached someway the end of text
  75.     if npos > len(text)-1:
  76.         if pos > len(text)-1:
  77.             break
  78.         else:
  79.             npos = len(text)-1
  80.     #read this phrase
  81.     tmpstring = text[pos:npos]
  82.     #replace all occurrences of the phrase, after the first one, with nothing
  83.     if tmpstring != "":
  84.         newtext = nth_replace(text, tmpstring, "", 2, "all right")
  85.         text = newtext
  86.     pos = text.find(searchthis, pos+1)+1 #continue from next word
  87.     if pos <= 0:
  88.         pos = len(text)
  89.  
  90. #delete double spaces
  91. newtext = newtext.replace("  ", " ")
  92.  
  93. if fileout:
  94.     text_file = open(fileout, "w")
  95.     text_file.write(newtext)
  96.     text_file.close()
  97. else:
  98.     print(newtext)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top