daily pastebin goal
16%
SHARE
TWEET

anlpPython

a guest Oct 12th, 2017 62 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/python
  2.  
  3. # Each stage of the process is numbered and printed it so you can see the conversion at each step
  4. # The print method also has a number in the form of a string, so it shows up in the console as documented.
  5. #i.e print( 'Number. ' + printedVariable )
  6.  
  7.  
  8. #imports
  9. import re
  10. import sys
  11. from random import random
  12. from math import log
  13. from collections import defaultdict
  14.  
  15. #0. This is the original string, it is printed in the main body of the code below
  16. input_string = "ASd1234567890 . @$$$$£ rupaulisbae  %" # Changed the input text to test the number function
  17.  
  18. #Creates a dictionary for counting the trigrams
  19. tri_counts=defaultdict(int) #counts of all trigrams in input
  20.  
  21. ## Checks that there is an input file to be read
  22. #if len(sys.argv) != 2:
  23. #    print("Usage: ", sys.argv[0], "<training_file>")
  24. #    sys.exit(1)
  25.  
  26. #infile = sys.argv[1] #get input argument: the training file
  27. ##
  28.  
  29. def preprocess_line(inputText): # Had to change input to a different name, as input is already pre-defined by python
  30.     # 1. make it lower case
  31.     lower_input = inputText.lower();
  32.     print('1. ' + lower_input)
  33.  
  34.     # 2. & 3. replace 1-9 -> 0
  35.     # Replace 1-9 -> 0
  36.         # It's not the most elegant but it does work, and we could refine it once it's all working
  37.         # Takes the previous input of the text and outputs a zero in place of the other number
  38.         # Cycles through this until it has been completed
  39.             # See print(numConversion1) and print(numConversionDone)
  40.     numConversion1 = lower_input.replace('1', '0')
  41.     print('2. ' + numConversion1)
  42.  
  43.     numConversion2 = numConversion1.replace('2', '0')
  44.     numConversion3 = numConversion2.replace('3', '0')
  45.     numConversion4 = numConversion3.replace('4', '0')
  46.     numConversion5 = numConversion4.replace('5', '0')
  47.     numConversion6 = numConversion5.replace('6', '0')
  48.     numConversion7 = numConversion6.replace('7', '0')
  49.     numConversion8 = numConversion7.replace('8', '0')
  50.     numConversion9 = numConversion8.replace('9', '0')
  51.  
  52.     numConversionDone = numConversion9
  53.     print('3. '+ numConversionDone) #    - here for demonstration of num conversion
  54.  
  55.     #4. Performs a regular expression, finds characters that match a-z, 0, [space] and [.]
  56.     #   Saves each value as an item in a list (converted to a string below)
  57.     letters_numbers = re.findall('([a-z0\.\ ])',numConversionDone)
  58.     print('4. '+ str(letters_numbers))
  59.     #   Converts the above list as a string
  60.     letters_numbers = "".join(letters_numbers)
  61.  
  62.     outputText = letters_numbers
  63.     # an outputText variable is created and returned as the output of our function
  64.     return outputText
  65.  
  66.  
  67. #________MAIN BODY OF THE CODE________________________________________________________________
  68. print('0. ' + input_string)
  69.  
  70. # 5. A variable is created with the returned variable (outputText)
  71. #    and stored in a new variable preProcessedText
  72. #    This is then printed in  the console
  73. pre_processed_text = preprocess_line(input_string)
  74. print('5. ' + pre_processed_text)
  75.  
  76. #This bit of code gives an example of how you might extract trigram counts
  77. #from a file, line by line. If you plan to use or modify this code,
  78. #please ensure you understand what it is actually doing, especially at the
  79. #beginning and end of each line. Depending on how you write the rest of
  80. #your program, you may need to modify this code.
  81.  
  82. # with open(infile) as file:
  83. #    list_lines = preprocess_file(file)  # --------preprocess the file, splitting the file into a list of new lines
  84.  
  85. ##Sorts
  86. test_string_tri_counts = input_string
  87. print('6. ' + input_string)
  88.  
  89. for line in input_string:
  90.     line = preprocess_line(input_string)  # doesn't do anything yet.
  91.     print(line)
  92.     for j in range(len(line) - (3)):
  93.         trigram = line[j:j + 3]
  94.         tri_counts[trigram] += 1
  95. print(tri_counts)
  96.  
  97.  
  98.  
  99. print("Trigram counts in ", test_string_tri_counts, ", sorted alphabetically:")
  100.  
  101.  
  102. # replaced_input = re.sub()
RAW Paste Data
Top