Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on Apr 28th, 2012  |  syntax: None  |  size: 4.58 KB  |  hits: 17  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. Regex searching using list elements to find matches in large document
  2. Traceback (most recent call last):
  3.   File "/Users/laurelhochstetler/scripts/identify_SNPs.py", line 57, in <module>
  4.     if re.match(item,"(.*)", Line):
  5.   File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/re.py", line 137, in match
  6.     return _compile(pattern, flags).match(string)
  7.   File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/re.py", line 242, in _compile
  8.     p = sre_compile.compile(pattern, flags)
  9.   File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/sre_compile.py", line 500, in compile
  10.     p = sre_parse.parse(p, flags)
  11.   File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/sre_parse.py", line 673, in parse
  12.     p = _parse_sub(source, pattern, 0)
  13.   File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/sre_parse.py", line 308, in _parse_sub
  14.     itemsappend(_parse(source, state))
  15.   File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/sre_parse.py", line 401, in _parse
  16.     if state.flags & SRE_FLAG_VERBOSE:
  17. TypeError: unsupported operand type(s) for &: 'str' and 'int'
  18.        
  19. #!/usr/bin/env python
  20. import re #this imports regular expression module
  21. import collections
  22.  
  23. MomGenome=open('/Users/laurelhochstetler/Documents/genetics fun/genome_Mary_Maloney_Full_20110514145353.txt', 'r')
  24. LaurelGenome=open('/Users/laurelhochstetler/Documents/genetics fun/genome_Laurel_Hochstetler_Full_20100411230740.txt', 'r')
  25. LineNumber = 0
  26. momSNP = []
  27. LaurelSNP = []
  28. f = open("mom_edit.txt","w")
  29. for Line in MomGenome:
  30.     if LineNumber > 0:
  31.         Line=Line.strip('\n')
  32.         ElementList=Line.split('\t')
  33.  
  34.         momSNP.append(ElementList[0])
  35.  
  36.         LineNumber = LineNumber + 1
  37. MomGenome.close()
  38. for Line in LaurelGenome:
  39.     if LineNumber > 0:
  40.         Line=Line.strip('\n')
  41.         ElementList=Line.split('\t')
  42.  
  43.         LaurelSNP.append(ElementList[0])
  44.  
  45.         LineNumber = LineNumber + 1
  46. momSNP_multiset = collections.Counter(momSNP)            
  47. LaurelSNP_multiset = collections.Counter(LaurelSNP)
  48. overlap = list((momSNP_multiset and LaurelSNP_multiset).elements())
  49. momSNP_left = list((momSNP_multiset - LaurelSNP_multiset).elements())
  50. LaurelSNP_left = list((LaurelSNP_multiset - momSNP_multiset).elements())
  51. LaurelGenome=open('/Users/laurelhochstetler/Documents/genetics fun/genome_Laurel_Hochstetler_Full_20100411230740.txt', 'r')
  52. i = 0
  53. for Line in LaurelGenome:
  54.     for item in LaurelSNP_left:
  55.             if i < 1961:
  56.                 if re.match(item, Line):
  57.                     pass
  58.  
  59.                 else:
  60.                     print Line
  61.  
  62.             i = i + 1
  63.     LineNumber = LineNumber + 1
  64.        
  65. momSNP = set()
  66. for line in MomGenome:
  67.     snp, rest = line.split(None, 1) # Split into two pieces only
  68.     momSNP.add(snp)
  69.  
  70. for line in MyGenome:
  71.     snp, rest = line.split(None, 1)
  72.     if snp in momSNP:
  73.         print line
  74.        
  75. LineNumber = 0
  76. MomGenome = open('20110514145353.txt', 'r')
  77. for Line in MomGenome:
  78.     if LineNumber > 0:
  79.         Line = Line.strip('\n')
  80.         ElementList = Line.split('\t')
  81.  
  82.         momSNP.append(ElementList[0])
  83.  
  84.         LineNumber = LineNumber + 1
  85.  
  86. MomGenome.close()
  87.        
  88. with open('20110514145353.txt') as mom_genome:
  89.     next(mom_genome)    # skipping the first line
  90.     for line in mom_genome:
  91.         elements = line.strip().split('\t')
  92.         momSNP.append(elements[0])
  93.        
  94. overlap = list((momSNP_multiset and LaurelSNP_multiset).elements())
  95.        
  96. overlap = list((momSNP_multiset & LaurelSNP_multiset).elements())
  97.        
  98. >>> from collections import Counter
  99. >>> a = Counter(a=4, b=2, c=0, d=-2)
  100. >>> b = Counter(a=2, b=0, c=0)
  101. >>> a
  102. Counter({'a': 4, 'b': 2, 'c': 0, 'd': -2})
  103. >>> b
  104. Counter({'a': 2, 'c': 0, 'b': 0})
  105. >>> a and b    # This will return b
  106. Counter({'a': 2, 'c': 0, 'b': 0})
  107. >>> a & b    # this will return the common elements
  108. Counter({'a': 2})
  109.        
  110. LaurelGenome = open('20100411230740.txt', 'r')
  111. i = 0
  112. for Line in LaurelGenome:
  113.     for item in LaurelSNP_left:
  114.         if i < 1961:
  115.             if re.match(item, Line):
  116.                 pass
  117.             else:
  118.                print Line
  119.  
  120.         i = i + 1
  121.     LineNumber = LineNumber + 1
  122.        
  123. with open('20100411230740.txt') as laurel_genome:
  124.     for line in laurel_genome:
  125.         i = 0
  126.         for item in LaurelSNP_left:
  127.             if i > 1960:
  128.                 break
  129.  
  130.             if line.strip().split('\t')[0] == item:
  131.                 print line
  132.  
  133.             i += 1
  134.        
  135. with file('zzz.txt') as f1:
  136.     first = frozenset([i.strip() for i in f1 if i.strip()])
  137.  
  138. with file('yyy.txt') as f2:
  139.     common = [i.strip().split('\t') for i in f2 if i.strip() in first]
  140.  
  141. genomes = {}
  142. for i in common:
  143.     genomes[i[0]] = i[1:]