Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sqlite3, re, math, os
- con = sqlite3.connect(':memory:')
- db = con.cursor()
- contigs = [390,389,166,690,6907,6993,2656,3501,6990,1530,7291,7069,7277,7290,6922,7257,7126,1500,7214]
- # Set filename and open input
- in_file = open("trimmed.ace", "r")
- # Compile regex for parsing each line
- AF_re = re.compile("AF\s(?P<name>.*?)\.(?P<range>\d+-\d+)(?P<link1>(?:\.fm|\.to)\d+)(?P<link2>(?:\.fm|\.to)\d+)?\s(?P<dir>[UC])")
- # Create table
- db.execute('''CREATE TABLE 'read_index' ('name' varchar(255) NOT NULL, 'contig' int(11) NOT NULL, 'range_low' int(11) NOT NULL, 'range_high' int(11) NOT NULL);''')
- db.execute('''CREATE INDEX 'name' ON 'read_index' ('name')''')
- db.execute('''CREATE INDEX 'contig' ON 'read_index' ('contig')''')
- # Parse each line in the trimmed ace file and insert it into the database
- for line in in_file:
- if line.startswith("CO"):
- # Current line is a contig, set current_contig value
- current_contig = int(line.strip()[9:14])
- else:
- # Current line is a read, parse it and insert it into the database with the current contig
- match = AF_re.match(line.strip())
- read_name = match.group('name')
- range = map(lambda x: int(x), match.group("range").split("-"))
- db.execute("INSERT INTO read_index (name, contig, range_low, range_high) VALUES (?, ?, ?, ?);", (read_name, current_contig, range[0], range[1]))
- # Commit changes to database
- con.commit()
- # SELECT a list of all reads contained in the database that have the contigs we're looking for
- read_list = db.execute("SELECT DISTINCT * FROM read_index WHERE contig=%s" % (' OR contig='.join(map(lambda x: str(x), contigs[1:])))).fetchall()
- # Make a dictionary of reads and their associated contigs
- read_contigs = {}
- for read in read_list:
- read_contigs[str(read[0])] = map(lambda x: x[0], db.execute("SELECT contig FROM read_index WHERE name=?", (str(read[0]),)).fetchall())
- # Get the paths that each read traces out by sorting their ranges and contigs associated with each range
- read_paths = {}
- for read, contig_list in read_contigs.items():
- contigs = db.execute("SELECT contig FROM read_index WHERE name=? ORDER BY range_low, range_high", (read,)).fetchall()
- path = '->'.join(map(lambda x: str(x[0]), contigs))
- try:
- # Increment path counter
- read_paths[path] += 1
- except:
- # New path so set counter to 1
- read_paths[path] = 1
- # Close the database
- db.close()
- # Print each path and frequency
- for path, frequency in read_paths.items():
- print path, frequency
Add Comment
Please, Sign In to add comment