Guest User

Untitled

a guest
Nov 24th, 2017
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.72 KB | None | 0 0
  1. # encoding: utf-8
  2.  
  3. """
  4. Module for testing code on outlier detection.
  5. """
  6.  
  7. from collections import defaultdict
  8. import math
  9. import sys
  10. import csv
  11.  
# Start-of-sequence and end-of-sequence boundary character used for
# internal computation. The same character marks both ends of every
# sequence, so it should ideally be one that never occurs inside the
# sequences themselves.
_BOUNDARY = '#'
  16.  
  17. def build_model(seqs, context_len=3):
  18. """
  19. Build a sequence (word) model and return it in a data structure.
  20.  
  21. Args:
  22. seqs (list): The list of sequences upon which the modelling
  23. is performed. Sequences can be either strings
  24. or lists/tuples (types can be mixed).
  25.  
  26. context_len (int): The length of the largest context for
  27. which probabilities are collected
  28. (whenever possible). Defaults to 3.
  29.  
  30. Returns:
  31. dict: a dictionary with the sequence model.
  32.  
  33. """
  34.  
  35. # initialize variables used for probability collection, as
  36. # follows:
  37. # - `counts_char` is a dictionary of dictionaries of integers
  38. # with all the observed context -> transition counts
  39. # - `counts_seqlen` is a dictionary of integers with all
  40. # the observed sequence lengths
  41. # - `logprobs_char` is a dictionary of dictionaries of floats
  42. # with the log-probabilities for all the observed
  43. # context -> transitions
  44. # - `logprobs_seqlen` is a dictionary of floats with
  45. # the log-probabilities for all the observed
  46. # context -> transitions
  47. counts_char = defaultdict(lambda: defaultdict(int))
  48. counts_seqlen = defaultdict(int)
  49. logprobs_char = {}
  50. logprobs_seqlen = {}
  51.  
  52. # join lists/tuples into strings, if needed, and then
  53. # add start-of-sequence and end-of-sequence boundaries
  54. seqs = [''.join(seq) if isinstance(seq, (list, tuple)) else seq for seq in seqs]
  55. seqs = ['%s%s%s' % (_BOUNDARY, seq, _BOUNDARY) for seq in seqs]
  56.  
  57. # iterate over all sequences
  58. for seq in seqs:
  59. # collect sequence length information (please note that
  60. # this intentionally includes sequences boundaries, so
  61. # that the minimum length should be 3); we also cache the
  62. # length of the current sequence, which will be used many times
  63. # in the loop
  64. len_seq = len(seq)
  65. counts_seqlen[len_seq] += 1
  66.  
  67. # iterate over all context lengths up to the maximum one
  68. for clength in range(1, context_len+1):
  69. # collect all subsequences of the current sequence of
  70. # length `clength`, excluding the first one (it starts at
  71. # index one) as we are not interested in the
  72. # NULL -> START_BOUNDARY transition common to all sequences;
  73. # plese note that `context` will be an empty string for
  74. # context of length 1 (which is the intended behaviour)
  75. for idx in range(len_seq-clength+1):
  76. context = seq[idx:idx+clength-1]
  77. transition = seq[idx+clength-1]
  78. counts_char[context][transition] += 1
  79.  
  80. # calculate the log-probability for all transitions in terms of their
  81. # context; we cache the denominator as a float to speed the computation
  82. for context in counts_char:
  83. total = float(sum(counts_char[context].values()))
  84. logprobs_char[context] = {context:math.log(count/total)
  85. for context, count
  86. in counts_char[context].items()}
  87.  
  88. # calculate the log-probability for all sequences lengths; we cache the
  89. # denominator as a float to speed the computation
  90. total = float(sum(counts_seqlen.values()))
  91. logprobs_seqlen = {seqlen:math.log(count/total) for seqlen, count in counts_seqlen.items()}
  92.  
  93. return {'char': logprobs_char, 'seqlen':logprobs_seqlen}
  94.  
  95. def seq_logprob(seq, model, use_length_prob=True, length_normalize=True):
  96. """
  97. Calculate the log-probability of a sequence according to a model.
  98.  
  99. Args:
  100. seq (str or list): The sequence whose probability will be calculated.
  101. model (dict): A sequence probability model, as returned by
  102. build_model().
  103. use_length_prob (bool): Whether to add a probability based on
  104. sequence length. Defaults to True.
  105. length_normalize (bool): Whether to normalize end probabilities
  106. dividing by sequence length. Defaults to True.
  107.  
  108. Return:
  109. float : The log-probability of the sequence (or None, if the
  110. sequence is shorter than the sequence length context
  111. the model was built upon).
  112. """
  113.  
  114.  
  115. # join list/tuple into a string, if needed, and then
  116. # add start-of-sequence and end-of-sequence boundaries
  117. if isinstance(seq, (list, tuple)):
  118. seq = ''.join(seq)
  119. seq = '%s%s%s' % (_BOUNDARY, seq, _BOUNDARY)
  120.  
  121. # extract the length of the largest context in the model, i.e.,
  122. # the parameter that was used with `build_model()`; the +1
  123. # is due to the need of including the transition
  124. context_len = max(map(len, model['char'])) + 1
  125.  
  126. # collect transition probabilities, iterating over all context
  127. # length up to the maximum one; we also cache the current
  128. # sequence length to speed computation
  129. seq_prob = 0.0
  130. len_seq = len(seq)
  131. for clength in range(1, context_len+1):
  132. for idx in range(len_seq-clength+1):
  133. context = seq[idx:idx+clength-1]
  134. transition = seq[idx+clength-1]
  135. seq_prob += model['char'][context][transition]
  136.  
  137. # add the probability for sequence length (including boundaries), if requested
  138. if use_length_prob:
  139. seq_prob += model['seqlen'][len_seq]
  140.  
  141. # normalize probabilities based on sequence length (including boundaries), if requested
  142. if length_normalize:
  143. seq_prob /= len_seq
  144.  
  145. return seq_prob
  146.  
  147.  
  148. def test():
  149. """
  150. Main test function.
  151. """
  152.  
  153. # read data
  154. with open('/usr/share/dict/words') as handler:
  155. lines = handler.readlines()
  156. words = [line[:-1] for line in lines]
  157.  
  158. # add obviously irregular entry
  159. words.append('+[@$@]xç{jk=~=qwertyuiop')
  160.  
  161. words.append('mytest')
  162. words.append(['m', 'y', 't', 'e', 's', 't'])
  163.  
  164. model = build_model(words)
  165.  
  166. # collect probabilities and print them ordered
  167. probs = {}
  168. for word in words:
  169. logp = seq_logprob(word, model)
  170. if logp:
  171. if isinstance(word, list):
  172. probs[tuple(word)] = logp
  173. else:
  174. probs[word] = logp
  175.  
  176. sorted_probs = sorted(probs.items(), key=lambda x: x[1])
  177. for entry in sorted_probs:
  178. print(entry)
  179.  
  180. def run_check(filename, column, tsv):
  181. # set delimiter
  182. if tsv:
  183. delimiter = '\t'
  184. else:
  185. delimiter = ','
  186.  
  187. # collect all entries
  188. seqs = []
  189. with open(filename) as csvfile:
  190. reader = csv.DictReader(csvfile, delimiter=delimiter)
  191. for row in reader:
  192. seqs.append(row[column])
  193.  
  194. # build model
  195. model = build_model(seqs)
  196.  
  197. # collect probabilities and print them ordered
  198. probs = {}
  199. for seq in seqs:
  200. logp = seq_logprob(seq, model)
  201. if logp:
  202. probs[seq] = logp
  203. else:
  204. probs[seq] = -99.999
  205.  
  206. sorted_probs = sorted(probs.items(), key=lambda x: x[1])
  207. for entry in sorted_probs:
  208. print(entry)
  209.  
  210. if __name__ == '__main__':
  211. # extremely simple argument handling
  212. import argparse
  213.  
  214. parser = argparse.ArgumentParser('check data')
  215. parser.add_argument('-t', '--tsv',
  216. help='use tabs as delimiters',
  217. action='store_true')
  218. parser.add_argument('filename', metavar='filename', help='file to check')
  219. parser.add_argument('column', metavar='column', help='name of the column to check')
  220. arguments = parser.parse_args()
  221.  
  222. run_check(arguments.filename, arguments.column, arguments.tsv)
  223.  
  224. #test()
Add Comment
Please, Sign In to add comment