Advertisement
Guest User

Untitled

a guest
Feb 28th, 2017
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.02 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import subprocess as sub
  4. import matplotlib.pyplot as plt
  5. import matplotlib.cm as cm
  6. %matplotlib inline
  7.  
  8. # size of the initial shared file
  9. START_SIZE = 444444
  10. NUM = 10
  11. SHARED_FILE = 'Mock_synthetic.shared'
  12. OTU_NUM = 6
  13. OTU_REP = 4
  14. OTU_SIZES = [( 100000 ) // ( 10 ** n ) for n in range(OTU_NUM)[::-1]]
  15. OTU_TOTAL = OTU_NUM * OTU_REP
  16. REF_COLS = ['Otu{}'.format(o + 1) for o in range(OTU_TOTAL)]
  17. REF_DF = pd.DataFrame([0 for p in range(OTU_NUM * OTU_REP)], index=REF_COLS, columns=['REF']).transpose()
  18.  
  19. """
  20. Subsample the shared file NUM times where for each n subsample the size is ( 1 * n ) / NUM
  21.  
  22. For example, with a NUM of 10, the shared file will be subsampled 10 times, and the subsampled sizes will be 1/10, 2/10, 3/10, etc of the original shared file size as specified by START_SIZE.
  23.  
  24. """
  25.  
  26. # run mothur and parse output files
  27. output_dfs = []
  28. for i in range(NUM)[::-1]:
  29. # set subsampling size
  30. size = ( START_SIZE // NUM ) * ( i + 1 )
  31.  
  32. # format and execute mothur commands
  33. commands = [
  34. 'sub.sample(shared={}, size={})'.format(SHARED_FILE, size),
  35. # 'get.current()'
  36. ]
  37.  
  38. mothur_batch = 'mothur "#{}"'.format(';'.join(commands))
  39. sub.call(mothur_batch)
  40.  
  41. # format data and append to output_dfs list
  42. output_df = pd.read_table('{}.0.03.subsample.shared'.format(SHARED_FILE.split('.shared')[0]))
  43. output_df = output_df.drop(['label', 'Group', 'numOtus'], axis=1)
  44. output_df = pd.concat([REF_DF, output_df]).transpose().ix[REF_COLS].drop('REF', axis=1)
  45. sub_dfs = []
  46. for j in range(OTU_REP):
  47. start_ = OTU_NUM * j
  48. stop_ = OTU_NUM * ( j + 1 )
  49. sub_df = output_df.iloc[start_:stop_]
  50. sub_df.index = OTU_SIZES
  51. sub_dfs.append(sub_df)
  52.  
  53. output_df = pd.concat(sub_dfs, axis=1, ignore_index=True).fillna(0)#.sum(axis=1).div(OTU_REP)
  54. output_df.name = size
  55. # uncomment the next line to print out each dataframe describing a single subsampling
  56. # print(size, '\n', output_df)
  57. output_dfs.append(output_df)
  58.  
  59. """
  60. Plot the results of the subsampling
  61.  
  62. """
  63.  
  64. dfs_len = len(output_dfs)
  65.  
  66. # init figure
  67. fig, ax = plt.subplots(nrows=dfs_len//2 , ncols=2, sharex=True, sharey=True, figsize=(10 , dfs_len * 2))
  68.  
  69. # iterate over dfs, plotting each as a subgraph
  70. for i in range(len(output_dfs)):
  71. df = output_dfs[i]
  72.  
  73. plt.subplot(dfs_len // 2, 2, i + 1)
  74. colors = iter(cm.Paired(np.linspace(0, 1, len(df))))
  75.  
  76. dim = OTU_REP
  77. width = 0.75
  78. dimw = width / dim
  79. x = np.arange(OTU_NUM)
  80.  
  81. for j in range(dim):
  82. plt.bar(x + j * dimw, df[j], dimw, color=next(colors))
  83.  
  84. plt.title('Subsampled to {} Sequences'.format(df.name))
  85. plt.xticks(x + dimw * 2, map(str, OTU_SIZES))
  86. plt.yscale('log')
  87.  
  88. # add common labels
  89. fig.text(0.5, 0.09, 'Original OTU abundance', ha='center', fontsize=16)
  90. fig.text(0.04, 0.5, 'Sub Sampled OTU Abundance', va='center', rotation='vertical', fontsize=16)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement