Advertisement
varyaaas

Untitled

Feb 16th, 2023
601
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.46 KB | None | 0 0
  def get_nc(line):
      """Extract the ``accession.version`` identifier from a FASTA header line.

      The identifier is read from the second ``|``-separated field of *line*
      (for example ``>lcl|NC_000913.3_cds_NP_414542.1_1 [gene=thrL]`` yields
      ``NC_000913.3``).

      Args:
          line: A FASTA header line containing at least one ``|``.

      Returns:
          The text up to and including all version digits that follow the
          first ``.`` of the field's first whitespace-delimited token. If that
          token has no ``.`` followed by a digit, the whole token is returned;
          if the field is empty or blank, ``''`` is returned.

      Raises:
          IndexError: If *line* contains no ``|`` separator.
      """
      field = line.split('|')[1]

      # Only the first token can hold the accession; anything after whitespace
      # is free-text description that may contain unrelated dots.
      tokens = field.split(None, 1)
      if not tokens:
          return ''
      token = tokens[0]

      dot = token.find('.')
      if dot == -1:
          # Unversioned identifier: return it whole instead of slicing with a
          # "-1" index, which would keep only the first character.
          return token

      # Consume every version digit, not just one, so that versions >= 10
      # (e.g. "NC_000913.12") are not truncated.
      end = dot + 1
      while end < len(token) and token[end] in '0123456789':
          end += 1
      if end == dot + 1:
          # The dot is not followed by a version number.
          return token
      return token[:end]
  def get_organism_name(nc: str) -> str:
      """Look up the name shown for an accession on the NCBI Assembly search page.

      The function requests ``https://www.ncbi.nlm.nih.gov/assembly/?term=<nc>``
      and returns the text that follows the first occurrence of the marker
      ``lock">`` in the page, up to the next ``<``.
      NOTE(review): the marker is presumably the tail of a CSS class on the
      result title element -- confirm against the current NCBI page markup.

      Args:
          nc: Accession (search term) to look up.

      Returns:
          The extracted text, or the placeholder ``'query'`` when the request
          fails or the marker is not present in the response.

      Raises:
          ImportError: If the third-party ``requests`` package is unavailable.
      """
      # Imported locally and outside the ``try`` so that a missing dependency
      # fails loudly instead of being swallowed and reported as "query" for
      # every single record.
      import requests

      base_url = 'https://www.ncbi.nlm.nih.gov/assembly/'
      pattern = 'lock">'

      try:
          # ``params`` URL-encodes the term; the timeout keeps one stalled
          # request from hanging the whole run.
          response = requests.get(base_url, params={'term': nc}, timeout=30)
      except requests.RequestException:
          # Best effort: network problems degrade to the placeholder.
          return 'query'

      # Use the decoded text rather than str(bytes), whose "b'...'" repr
      # escapes every non-ASCII byte.
      s = response.text
      pos = s.find(pattern)
      if pos == -1:
          return 'query'

      start = pos + len(pattern)
      end = s.find('<', start)
      if end == -1:
          end = len(s)
      return s[start:end]
  def do_organism_modification(organism_name):
      """Turn an organism name into a ``|``-prefixed, underscore-joined suffix.

      Everything from the last ``(`` onwards is dropped (together with the
      whitespace directly before it), the remaining spaces are replaced by
      underscores, and ``|`` is prepended.

      Args:
          organism_name: Human-readable name, optionally ending in a
              parenthesised remark, e.g. ``'Escherichia coli (E. coli)'``.

      Returns:
          The suffix string, e.g. ``'|Escherichia_coli'``.
      """
      paren = organism_name.rfind('(')
      if paren != -1:
          # Strip trailing whitespace instead of blindly removing one
          # character, so names without a space before "(" keep their last
          # letter and a name that starts with "(" does not hit a "-1" slice.
          organism_name = organism_name[:paren].rstrip()
      return '|' + organism_name.replace(' ', '_')
  def do_everything(input_file, output_file):
      """Copy a FASTA file, appending an organism suffix to every header line.

      Lines starting with ``>`` are written with trailing whitespace removed
      and the suffix produced by ``do_organism_modification`` appended; all
      other lines are copied unchanged.

      Args:
          input_file: Path of the FASTA file to read.
          output_file: Path of the annotated file to write (overwritten).

      Raises:
          IndexError: Propagated from ``get_nc`` when a header has no ``|``.
      """
      # Successful lookups are remembered per accession so that files with
      # many records from the same sequence trigger one web request instead
      # of one per header line.
      suffix_by_nc = {}

      with open(input_file, 'r') as inp, open(output_file, 'w') as outp:
          for line in inp:
              if not line.startswith('>'):
                  outp.write(line)
                  continue

              nc = get_nc(line)
              org = suffix_by_nc.get(nc)
              if org is None:
                  organism_name = get_organism_name(nc)
                  org = do_organism_modification(organism_name)
                  # 'query' is the lookup's failure placeholder; do not cache
                  # it, so a transient failure is retried on the next header
                  # exactly as before.
                  if organism_name != 'query':
                      suffix_by_nc[nc] = org
              print(line.rstrip() + org, file=outp)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement