Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- import subprocess as sub
- import matplotlib.pyplot as plt
- import matplotlib.cm as cm
- %matplotlib inline
- # size of the initial shared file
- START_SIZE = 444444
- NUM = 10
- SHARED_FILE = 'Mock_synthetic.shared'
- OTU_NUM = 6
- OTU_REP = 4
- OTU_SIZES = [( 100000 ) // ( 10 ** n ) for n in range(OTU_NUM)[::-1]]
- OTU_TOTAL = OTU_NUM * OTU_REP
- REF_COLS = ['Otu{}'.format(o + 1) for o in range(OTU_TOTAL)]
- REF_DF = pd.DataFrame([0 for p in range(OTU_NUM * OTU_REP)], index=REF_COLS, columns=['REF']).transpose()
- """
- Subsample the shared file NUM times where for each n subsample the size is ( 1 * n ) / NUM
- For example, with a NUM of 10, the shared file will be subsampled 10 times, and the subsampled sizes will be 1/10, 2/10, 3/10, etc of the original shared file size as specified by START_SIZE.
- """
- # run mothur and parse output files
- output_dfs = []
- for i in range(NUM)[::-1]:
- # set subsampling size
- size = ( START_SIZE // NUM ) * ( i + 1 )
- # format and execute mothur commands
- commands = [
- 'sub.sample(shared={}, size={})'.format(SHARED_FILE, size),
- # 'get.current()'
- ]
- mothur_batch = 'mothur "#{}"'.format(';'.join(commands))
- sub.call(mothur_batch)
- # format data and append to output_dfs list
- output_df = pd.read_table('{}.0.03.subsample.shared'.format(SHARED_FILE.split('.shared')[0]))
- output_df = output_df.drop(['label', 'Group', 'numOtus'], axis=1)
- output_df = pd.concat([REF_DF, output_df]).transpose().ix[REF_COLS].drop('REF', axis=1)
- sub_dfs = []
- for j in range(OTU_REP):
- start_ = OTU_NUM * j
- stop_ = OTU_NUM * ( j + 1 )
- sub_df = output_df.iloc[start_:stop_]
- sub_df.index = OTU_SIZES
- sub_dfs.append(sub_df)
- output_df = pd.concat(sub_dfs, axis=1, ignore_index=True).fillna(0)#.sum(axis=1).div(OTU_REP)
- output_df.name = size
- # uncomment the next line to print out each dataframe describing a single subsampling
- # print(size, '\n', output_df)
- output_dfs.append(output_df)
- """
- Plot the results of the subsampling
- """
- dfs_len = len(output_dfs)
- # init figure
- fig, ax = plt.subplots(nrows=dfs_len//2 , ncols=2, sharex=True, sharey=True, figsize=(10 , dfs_len * 2))
- # iterate over dfs, plotting each as a subgraph
- for i in range(len(output_dfs)):
- df = output_dfs[i]
- plt.subplot(dfs_len // 2, 2, i + 1)
- colors = iter(cm.Paired(np.linspace(0, 1, len(df))))
- dim = OTU_REP
- width = 0.75
- dimw = width / dim
- x = np.arange(OTU_NUM)
- for j in range(dim):
- plt.bar(x + j * dimw, df[j], dimw, color=next(colors))
- plt.title('Subsampled to {} Sequences'.format(df.name))
- plt.xticks(x + dimw * 2, map(str, OTU_SIZES))
- plt.yscale('log')
- # add common labels
- fig.text(0.5, 0.09, 'Original OTU abundance', ha='center', fontsize=16)
- fig.text(0.04, 0.5, 'Sub Sampled OTU Abundance', va='center', rotation='vertical', fontsize=16)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement