Advertisement
Guest User

anlpPython

a guest
Oct 12th, 2017
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.79 KB | None | 0 0
  1. #!/usr/bin/python
  2.  
  3. # Each stage of the process is numbered and printed so you can see the conversion at each step.
  4. # Each print call also prefixes its output with the stage number as a string, so the console output matches the numbering in the comments.
  5. # i.e. print('Number. ' + printedVariable)
  6.  
  7.  
  8. #imports
  9. import re
  10. import sys
  11. from random import random
  12. from math import log
  13. from collections import defaultdict
  14.  
  15. #0. This is the original string, it is printed in the main body of the code below
  16. input_string = "ASd1234567890 . @$$$$£ rupaulisbae %" # Changed the input text to test the number function
  17.  
  18. #Creates a dictionary for counting the trigrams
  19. tri_counts=defaultdict(int) #counts of all trigrams in input
  20.  
  21. ## Checks that there is an input file to be read
  22. #if len(sys.argv) != 2:
  23. # print("Usage: ", sys.argv[0], "<training_file>")
  24. # sys.exit(1)
  25.  
  26. #infile = sys.argv[1] #get input argument: the training file
  27. ##
  28.  
  29. def preprocess_line(inputText): # Had to change input to a different name, as input is already pre-defined by python
  30. # 1. make it lower case
  31. lower_input = inputText.lower();
  32. print('1. ' + lower_input)
  33.  
  34. # 2. & 3. replace 1-9 -> 0
  35. # Replace 1-9 -> 0
  36. # It's not the most elegant but it does work, and we could refine it once it's all working
  37. # Takes the previous input of the text and outputs a zero in place of the other number
  38. # Cycles through this until it has been completed
  39. # See print(numConversion1) and print(numConversionDone)
  40. numConversion1 = lower_input.replace('1', '0')
  41. print('2. ' + numConversion1)
  42.  
  43. numConversion2 = numConversion1.replace('2', '0')
  44. numConversion3 = numConversion2.replace('3', '0')
  45. numConversion4 = numConversion3.replace('4', '0')
  46. numConversion5 = numConversion4.replace('5', '0')
  47. numConversion6 = numConversion5.replace('6', '0')
  48. numConversion7 = numConversion6.replace('7', '0')
  49. numConversion8 = numConversion7.replace('8', '0')
  50. numConversion9 = numConversion8.replace('9', '0')
  51.  
  52. numConversionDone = numConversion9
  53. print('3. '+ numConversionDone) # - here for demonstration of num conversion
  54.  
  55. #4. Performs a regular expression, finds characters that match a-z, 0, [space] and [.]
  56. # Saves each value as an item in a list (converted to a string below)
  57. letters_numbers = re.findall('([a-z0\.\ ])',numConversionDone)
  58. print('4. '+ str(letters_numbers))
  59. # Converts the above list as a string
  60. letters_numbers = "".join(letters_numbers)
  61.  
  62. outputText = letters_numbers
  63. # an outputText variable is created and returned as the output of our function
  64. return outputText
  65.  
  66.  
  67. #________MAIN BODY OF THE CODE________________________________________________________________
  68. print('0. ' + input_string)
  69.  
  70. # 5. A variable is created with the returned variable (outputText)
  71. # and stored in a new variable preProcessedText
  72. # This is then printed in the console
  73. pre_processed_text = preprocess_line(input_string)
  74. print('5. ' + pre_processed_text)
  75.  
  76. #This bit of code gives an example of how you might extract trigram counts
  77. #from a file, line by line. If you plan to use or modify this code,
  78. #please ensure you understand what it is actually doing, especially at the
  79. #beginning and end of each line. Depending on how you write the rest of
  80. #your program, you may need to modify this code.
  81.  
  82. # with open(infile) as file:
  83. # list_lines = preprocess_file(file) # --------preprocess the file, splitting the file into a list of new lines
  84.  
  85. ##Sorts
  86. test_string_tri_counts = input_string
  87. print('6. ' + input_string)
  88.  
  89. for line in input_string:
  90. line = preprocess_line(input_string) # doesn't do anything yet.
  91. print(line)
  92. for j in range(len(line) - (3)):
  93. trigram = line[j:j + 3]
  94. tri_counts[trigram] += 1
  95. print(tri_counts)
  96.  
  97.  
  98.  
  99. print("Trigram counts in ", test_string_tri_counts, ", sorted alphabetically:")
  100.  
  101.  
  102. # replaced_input = re.sub()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement