Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def get_nc(line):
    """Extract the versioned accession from a '|'-delimited header line.

    The accession is read from the second '|'-separated field. It runs from
    the start of that field through the first '.' and all digits that
    directly follow it, so multi-digit versions are kept whole
    (e.g. ``NC_000001.11`` rather than ``NC_000001.1``).

    Args:
        line: A header line such as ``>lcl|NC_000913.3_cds_1 ...``.

    Returns:
        The accession with its version. If the field contains no '.', the
        whole field with surrounding whitespace removed.

    Raises:
        IndexError: If the line contains no '|' separator.
    """
    field = line.split('|')[1]
    dot = field.find('.')
    if dot == -1:
        # No version part: return the field itself rather than slicing with
        # an offset derived from find()'s -1 "not found" result.
        return field.strip()
    end = dot + 1
    # Consume the whole version number, however many digits it has.
    while end < len(field) and field[end] in '0123456789':
        end += 1
    return field[:end]
def get_organism_name(nc: str):
    """Look up an organism name for an accession on the NCBI Assembly site.

    Requests the Assembly search page for ``nc`` and returns the text found
    between the first occurrence of the marker ``lock">`` and the next ``<``.

    Args:
        nc: Accession used as the search term.

    Returns:
        The extracted text, or the placeholder string ``'query'`` when the
        request fails or the marker is not present in the page.
    """
    base_url = 'https://www.ncbi.nlm.nih.gov/assembly/'
    pattern = 'lock">'
    try:
        # `params` URL-encodes the term; the timeout keeps a stalled
        # connection from hanging the caller indefinitely.
        response = requests.get(base_url, params={'term': nc}, timeout=30)
    except requests.RequestException:
        return 'query'

    # Use the decoded text: str(response.content) would be the repr of a
    # bytes object, with quotes and non-ASCII characters backslash-escaped.
    page = response.text
    start = page.find(pattern)
    if start == -1:
        return 'query'
    start += len(pattern)
    end = page.find('<', start)
    if end == -1:
        # No closing tag after the marker: take the rest of the page.
        end = len(page)
    return page[start:end]
def do_organism_modification(organism_name):
    """Turn an organism name into a '|'-prefixed, underscore-joined suffix.

    Everything from the last '(' onward is dropped, along with any
    whitespace directly before it. Remaining spaces become underscores and
    the result is prefixed with '|'.

    Args:
        organism_name: Name such as ``'Homo sapiens (human)'``.

    Returns:
        The suffix, e.g. ``'|Homo_sapiens'``.
    """
    paren = organism_name.rfind('(')
    if paren != -1:
        # Cut at the parenthesis itself and trim trailing whitespace, rather
        # than assuming exactly one space precedes it (which would drop a
        # real character from names like 'E.coli(K12)').
        organism_name = organism_name[:paren].rstrip()
    return '|' + organism_name.replace(' ', '_')
def do_everything(input_file, output_file):
    """Copy a file, appending an organism suffix to every '>' header line.

    For each line starting with '>', the accession is taken with
    ``get_nc``, resolved with ``get_organism_name``, formatted with
    ``do_organism_modification`` and appended to the header. All other
    lines are written unchanged.

    Args:
        input_file: Path of the file to read.
        output_file: Path of the file to write (overwritten if it exists).

    Exceptions raised by ``get_nc`` for a malformed header propagate to the
    caller.
    """
    # Suffixes already resolved during this run, keyed by accession, so a
    # repeated accession does not trigger another web lookup.
    suffix_by_nc = {}
    with open(input_file, 'r') as inp, open(output_file, 'w') as outp:
        for line in inp:
            if not line.startswith('>'):
                outp.write(line)
                continue
            nc = get_nc(line)
            org = suffix_by_nc.get(nc)
            if org is None:
                organism_name = get_organism_name(nc)
                org = do_organism_modification(organism_name)
                # 'query' is the lookup-failed placeholder; leave it
                # uncached so a later header with the same accession
                # retries the lookup.
                if organism_name != 'query':
                    suffix_by_nc[nc] = org
            print(line.rstrip() + org, file=outp)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement