Untitled

import pandas as pd
import numpy as np
import subprocess as sub
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

# size of the initial shared file
START_SIZE = 444444
NUM = 10
SHARED_FILE = 'Mock_synthetic.shared'
OTU_NUM = 6
OTU_REP = 4
OTU_SIZES = [( 100000 ) // ( 10 ** n ) for n in range(OTU_NUM)[::-1]]
OTU_TOTAL = OTU_NUM * OTU_REP
REF_COLS = ['Otu{}'.format(o + 1) for o in range(OTU_TOTAL)]
REF_DF = pd.DataFrame([0 for p in range(OTU_NUM * OTU_REP)], index=REF_COLS, columns=['REF']).transpose()

"""
Subsample the shared file NUM times where for each n subsample the size is ( 1 * n ) / NUM

For example, with a NUM of 10, the shared file will be subsampled 10 times, and the subsampled sizes will be 1/10, 2/10, 3/10, etc of the original shared file size as specified by START_SIZE.

"""

# run mothur and parse output files
output_dfs = []
for i in range(NUM)[::-1]:
    # set subsampling size
    size = ( START_SIZE // NUM ) * ( i + 1 )

    # format and execute mothur commands
    commands = [
        'sub.sample(shared={}, size={})'.format(SHARED_FILE, size),
#        'get.current()'
    ]

    mothur_batch = 'mothur "#{}"'.format(';'.join(commands))
    sub.call(mothur_batch)

    # format data and append to output_dfs list
    output_df = pd.read_table('{}.0.03.subsample.shared'.format(SHARED_FILE.split('.shared')[0]))
    output_df = output_df.drop(['label', 'Group', 'numOtus'], axis=1)
    output_df = pd.concat([REF_DF, output_df]).transpose().ix[REF_COLS].drop('REF', axis=1)
    sub_dfs = []
    for j in range(OTU_REP):
        start_ = OTU_NUM * j
        stop_ = OTU_NUM * ( j + 1 )
        sub_df = output_df.iloc[start_:stop_]
        sub_df.index = OTU_SIZES
        sub_dfs.append(sub_df)

    output_df = pd.concat(sub_dfs, axis=1, ignore_index=True).fillna(0)#.sum(axis=1).div(OTU_REP)
    output_df.name = size
	# uncomment the next line to print out each dataframe describing a single subsampling
#     print(size, '\n', output_df)
    output_dfs.append(output_df)

"""
Plot the results of the subsampling

"""

dfs_len = len(output_dfs)

# init figure
fig, ax = plt.subplots(nrows=dfs_len//2 , ncols=2, sharex=True, sharey=True, figsize=(10 , dfs_len * 2))

# iterate over dfs, plotting each as a subgraph
for i in range(len(output_dfs)):
    df = output_dfs[i]

    plt.subplot(dfs_len // 2, 2, i + 1)
    colors = iter(cm.Paired(np.linspace(0, 1, len(df))))

    dim = OTU_REP
    width = 0.75
    dimw = width / dim
    x = np.arange(OTU_NUM)

    for j in range(dim):
        plt.bar(x + j * dimw, df[j], dimw, color=next(colors))

    plt.title('Subsampled to {} Sequences'.format(df.name))
    plt.xticks(x + dimw * 2, map(str, OTU_SIZES))
    plt.yscale('log')

# add common labels
fig.text(0.5, 0.09, 'Original OTU abundance', ha='center', fontsize=16)
fig.text(0.04, 0.5, 'Sub Sampled OTU Abundance', va='center', rotation='vertical', fontsize=16)