- Regex searching using list elements to find matches in large document
- Traceback (most recent call last):
- File "/Users/laurelhochstetler/scripts/identify_SNPs.py", line 57, in <module>
- if re.match(item,"(.*)", Line):
- File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/re.py", line 137, in match
- return _compile(pattern, flags).match(string)
- File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/re.py", line 242, in _compile
- p = sre_compile.compile(pattern, flags)
- File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/sre_compile.py", line 500, in compile
- p = sre_parse.parse(p, flags)
- File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/sre_parse.py", line 673, in parse
- p = _parse_sub(source, pattern, 0)
- File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/sre_parse.py", line 308, in _parse_sub
- itemsappend(_parse(source, state))
- File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/sre_parse.py", line 401, in _parse
- if state.flags & SRE_FLAG_VERBOSE:
- TypeError: unsupported operand type(s) for &: 'str' and 'int'
#!/usr/bin/env python
"""Compare the SNP ids found in two tab-separated genome export files.

The first tab-separated column of every line after the header line is
treated as the SNP id.  The script prints each line of Laurel's file whose
id is not one of the ids that occur only in Laurel's file.
"""
import re  # this imports regular expression module (the lookup below no longer needs it)
import collections

MOM_GENOME_PATH = '/Users/laurelhochstetler/Documents/genetics fun/genome_Mary_Maloney_Full_20110514145353.txt'
LAUREL_GENOME_PATH = '/Users/laurelhochstetler/Documents/genetics fun/genome_Laurel_Hochstetler_Full_20100411230740.txt'

# Hard-coded cap carried over from the original loop (`if i < 1961`).
# NOTE(review): presumably the number of Laurel-only ids at the time -- confirm.
MAX_LAUREL_ONLY_IDS = 1961


def read_snp_ids(path):
    """Return the first tab-separated field of every line after the first.

    The first line is skipped as a header; an empty file yields ``[]``.
    """
    with open(path, 'r') as genome:
        next(genome, None)  # skip the header without failing on an empty file
        # '\n' and '\t' (not the letters 'n' and 't') delimit lines and columns.
        return [line.rstrip('\n').split('\t')[0] for line in genome]


def compare_snp_ids(mom_ids, laurel_ids):
    """Return ``(overlap, mom_only, laurel_only)`` as lists of ids.

    Ids are compared as multisets, so an id listed twice in one input and
    once in the other appears once in the overlap and once in the remainder.
    """
    mom_multiset = collections.Counter(mom_ids)
    laurel_multiset = collections.Counter(laurel_ids)
    # `&` is the multiset intersection; `and` would merely return one operand.
    overlap = list((mom_multiset & laurel_multiset).elements())
    mom_only = list((mom_multiset - laurel_multiset).elements())
    laurel_only = list((laurel_multiset - mom_multiset).elements())
    return overlap, mom_only, laurel_only


def lines_without_ids(path, excluded_ids):
    """Yield each line of *path* whose first tab-separated field is not excluded.

    Lines are yielded unchanged, trailing newline included.  Ids are compared
    for exact equality, so excluding "rs12" does not drop "rs123".
    """
    excluded = frozenset(excluded_ids)  # O(1) membership instead of a scan per line
    with open(path, 'r') as genome:
        for line in genome:
            if line.rstrip('\n').split('\t')[0] not in excluded:
                yield line


def main():
    """Read both genome files and print the surviving lines of Laurel's file."""
    # The original opened this file for writing and never wrote to or closed
    # it; keep the truncating side effect but release the handle right away.
    with open("mom_edit.txt", "w"):
        pass

    momSNP = read_snp_ids(MOM_GENOME_PATH)
    LaurelSNP = read_snp_ids(LAUREL_GENOME_PATH)
    overlap, momSNP_left, LaurelSNP_left = compare_snp_ids(momSNP, LaurelSNP)

    excluded = LaurelSNP_left[:MAX_LAUREL_ONLY_IDS]
    for line in lines_without_ids(LAUREL_GENOME_PATH, excluded):
        # The line still carries its newline, so output is double-spaced
        # exactly as with the original print statement.
        print(line)


if __name__ == "__main__":
    main()
# Collect Mom's SNP ids into a set so every membership test below is O(1).
momSNP = set()
for line in MomGenome:
    fields = line.split(None, 1)  # only the first whitespace-delimited token is needed
    # Blank lines yield no token (and single-token lines yield one); unpacking
    # into two names, as the original did, raised ValueError on both.
    if fields:
        momSNP.add(fields[0])

# Print every line of MyGenome whose SNP id also occurs in Mom's file.
for line in MyGenome:
    fields = line.split(None, 1)
    if fields and fields[0] in momSNP:
        print(line)
# Append the SNP id (first tab-separated column) of every line after the
# header line of Mom's genome file to momSNP.
LineNumber = 0
MomGenome = open('20110514145353.txt', 'r')
try:
    for Line in MomGenome:
        if LineNumber > 0:  # line 0 is the header
            # '\n' / '\t' are the newline and tab characters; the bare
            # letters 'n' / 't' would strip and split on ordinary text.
            Line = Line.strip('\n')
            ElementList = Line.split('\t')
            momSNP.append(ElementList[0])
        LineNumber = LineNumber + 1
finally:
    # Close the handle even if a read fails part-way through.
    MomGenome.close()
# Append the SNP id (first tab-separated column) of every line after the
# header of Mom's genome file to momSNP; the file is closed automatically.
with open('20110514145353.txt') as mom_genome:
    next(mom_genome, None)  # skipping the first line (no error if the file is empty)
    for line in mom_genome:
        # Trim only the line ending: a bare strip() would also eat a leading
        # tab and shift the columns.  Split on the tab character, not 't'.
        elements = line.rstrip('\r\n').split('\t')
        momSNP.append(elements[0])  # the list is named momSNP, not mom_SNP
# Multiset intersection: `&` keeps each id with the smaller of its two counts.
# (`momSNP_multiset and LaurelSNP_multiset` is a boolean expression that just
# evaluates to one of its operands, so it must not be used to intersect.)
overlap = list((momSNP_multiset & LaurelSNP_multiset).elements())
- >>> from collections import Counter
- >>> a = Counter(a=4, b=2, c=0, d=-2)
- >>> b = Counter(a=2, b=0, c=0)
- >>> a
- Counter({'a': 4, 'b': 2, 'c': 0, 'd': -2})
- >>> b
- Counter({'a': 2, 'c': 0, 'b': 0})
- >>> a and b # This will return b
- Counter({'a': 2, 'c': 0, 'b': 0})
- >>> a & b # this will return the common elements
- Counter({'a': 2})
# Print each line of Laurel's genome file whose SNP id (first tab-separated
# column) is not one of the Laurel-only ids.
#
# The original ran re.match(item, Line) for every (line, id) pair.  That
# treated each id as a regex prefix ("rs12" also matched "rs123"), printed a
# line once per non-matching id, and never closed the file.  Exact set
# membership on the id column avoids all three problems.
#
# NOTE(review): 1961 is the hard-coded cap from the original `i < 1961`
# guard; here it limits how many ids are excluded -- confirm the intent.
laurel_only_ids = frozenset(LaurelSNP_left[:1961])
with open('20100411230740.txt', 'r') as LaurelGenome:
    for Line in LaurelGenome:
        if Line.rstrip('\n').split('\t')[0] not in laurel_only_ids:
            print(Line)
# Print each line of Laurel's genome file once for every entry among the
# first 1961 items of LaurelSNP_left that equals the line's SNP id (the
# first tab-separated column).
candidates = LaurelSNP_left[:1961]  # same bound as the original `if i > 1960: break`
with open('20100411230740.txt') as laurel_genome:
    for line in laurel_genome:
        # Split once per line instead of once per candidate; the separator
        # is the tab character, not the letter 't'.
        snp_id = line.strip().split('\t')[0]
        for item in candidates:
            if snp_id == item:
                print(line)
# Non-blank lines of zzz.txt, whitespace-trimmed, as a lookup set.
# open() replaces file(), which exists only in Python 2.
with open('zzz.txt') as f1:
    first = frozenset(row for row in (line.strip() for line in f1) if row)

# Lines of yyy.txt that also occur (after trimming) in zzz.txt, split into
# tab-separated fields.  `first` holds no empty string, so blank lines are
# never selected and every selected row has at least one field.
with open('yyy.txt') as f2:
    common = [row.split('\t') for row in (line.strip() for line in f2) if row in first]

# Map each common row's first field to its remaining fields; a later row
# with the same first field replaces an earlier one.
genomes = {}
for fields in common:
    genomes[fields[0]] = fields[1:]