Guest User

Untitled

a guest
Mar 22nd, 2018
103
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.68 KB | None | 0 0
  1. head -1 rare-diseases/annotated/v2smallFiles/DECIPHER-18.tab | tr '\t' '\n' | grep -n .
  2. 1:#diseaseID
  3. 2:diseaseName
  4. 3:phenotypeId
  5. 4:phenotypeName
  6. 5:onsetId
  7. 6:onsetName
  8. 7:frequency
  9. 8:sex
  10. 9:negation
  11. 10:modifier
  12. 11:description
  13. 12:publication
  14. 13:evidence
  15. 14:assignedBy
  16. 15:dateCreated
  17.  
  18. ########################
  19.  
  20. # easier to survey the records in one pile
  21. grep -h -v "^#" rare-diseases/annotated/v2smallFiles/*.tab > v2.tab
  22.  
  23. # howmany rows
  24. wc -l < v2.tab
  25. 92,727
  26.  
  27. # right number of columns every time?
  28. awk -F'\t' 'NF!=15' v2.tab
  29. # yep
  30.  
  31. # disease identifier types
  32. cut -f1 -d':' v2.tab | sort | uniq -c | sort -nr
  33. 92430 OMIM
  34. 297 DECIPHER
  35.  
  36. # files with the most rows
  37. cut -f1 v2.tab | uniq -c | sort -nr| head
  38. 131 OMIM:312870
  39. 128 OMIM:180849
  40. 108 OMIM:607872
  41. 108 OMIM:194050
  42. 99 OMIM:613406
  43. 95 OMIM:270400
  44. 94 OMIM:194190
  45. 87 OMIM:601803
  46. 87 OMIM:122470
  47. 85 OMIM:305600
  48.  
  49. # check if any disease identifiers in more than one file?
  50. cut -f1 v2.tab | uniq -c | sort -nr| cut -c9- | uniq -c | sort -nr | head
  51. 1 OMIM:617537
  52. 1 OMIM:617526
  53. 1 OMIM:617506
  54. 1 OMIM:617478
  55. 1 OMIM:617468
  56. 1 OMIM:617466
  57. 1 OMIM:617460
  58. 1 OMIM:617452
  59. 1 OMIM:617450
  60. 1 OMIM:617442
  61. # no
  62.  
  63. # summary stats on rows per file
  64. cut -f1 v2.tab | uniq -c | sort -nr| cut -c1-8 | ./sumstat.r
  65. V1
  66. Min. : 1.00
  67. 1st Qu.: 4.00
  68. Median : 8.00
  69. Mean : 12.61
  70. 3rd Qu.: 17.00
  71. Max. :131.00
  72. [1] "sd :12.46"
  73.  
  74. ###########################################
  75. # sumstat.r
  76. #! /usr/bin/Rscript --vanilla
  77. x <- read.csv('stdin', header = F);
  78. summary(x);
  79. sprintf("sd :%.02f", sd(x[,1]));
  80. ###########################################
  81.  
  82.  
  83.  
  84.  
  85.  
  86. ###########################################
  87. # diseaseID
  88. # howmany distinct disease identifiers
  89. cut -f1 v2.tab | uniq | wc -l
  90. 7351
  91.  
  92. #####################################
  93. # diseaseName
  94. # howmany distinct disease ... "word thingies"(tm)
  95. cut -f2 v2.tab | uniq | wc -l
  96. 12378
  97. # not 1:1
  98.  
  99. # sometimes appears to be a ';;' separated list
  100. # howmany are lists?
  101. cut -f2 v2.tab | sort -u | grep -c ";;"
  102. 2437
  103.  
  104. ##########################################
  105. # phenotypeId
  106. # howmany?
  107. cut -f3 v2.tab | wc -l
  108. 92727
  109. # unique?
  110. cut -f3 v2.tab | sort -u | wc -l
  111. 6994
  112.  
  113. # have correct curie prefix?
  114. cut -f3 v2.tab | cut -f1 -d':' | uniq -c
  115. 92727 HP
  116.  
  117. ################################
  118. # phenotypeName
  119. cut -f4 v2.tab | wc -l
  120. 92727
  121. cut -f4 v2.tab | sort -u | wc -l
  122. 6994
  123.  
  124. # anything not simple words? any punctuation indicating lists?
  125. cut -f4 v2.tab | grep -v "[a-z A-Z]*"
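  126. # nothing -- but note "[a-z A-Z]*" also matches the empty string, so every line matches and `grep -v` can never print anything; this check is vacuous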
  127. ################################
  128. # onsetId
  129. cut -f5 v2.tab | sort | uniq -c | sort -nr
  130. 92194
  131. 134 HP:0003577
  132. 97 HP:0003593
  133. 87 HP:0011463
  134. 74 HP:0003623
  135. 50 HP:0003581
  136. 45 HP:0003621
  137. 33 HP:0003584
  138. 8 HP:0011462
  139. 3 HP:0003596
  140. 1 HP:0011461
  141. 1 HP:0003674
  142.  
  143. #####################################
  144. # onsetName
  145. 92194
  146. 134 Congenital onset
  147. 97 Infantile onset
  148. 87 Childhood onset
  149. 74 Neonatal onset
  150. 50 Adult onset
  151. 45 Juvenile onset
  152. 33 Late onset
  153. 8 Young adult onset
  154. 3 Middle age onset
  155. 1 Onset
  156. 1 Fetal onset
  157.  
  158. ##########################################
  159.  
  160. Here columns reordered from last time
  161.  
  162. repo was not in sync on removed files
  163.  
  164. ################################
  165. # frequency
  166. # sparse hodgepodge of:
  167. # nothing
  168. # identifiers
  169. # rationals
  170. # percentages (including ranges of percentages)
  171.  
  172. # easy: express percentages more uniformly
  173. # hard: recover the proper rational a percentage was derived from (curating pubs)
  174. #
  175. # mixing identifiers and rationals;
  176. # I guess one sparse column is better than two
  177. # but what would really make it worthwhile is if the identifiers
  178. # referenced a value (back in the ontology) which allowed them to be comparable
  179. # (even approximately) to the proper rationals (and percentages)
  180. #
  181. # Okay we sort of have that but uncomputably in a string
  182.  
  183. HP:0040283 "Occasional (29-5%)"
  184.  
  185. Maybe something like statements along the lines of:
  186.  
  187. <HP:0040283> <???:greater than or equal to> 0.05 .
  188. <HP:0040283> <???:less than or equal to> 0.29 .
  189.  
  190. ----------------------------------------------------
  191.  
  192. cut -f7 v2.tab | sort | uniq -c | sort -nr | head
  193. 84602
  194. 5087 HP:0040283
  195. 682 HP:0040282
  196. 179 HP:0040281
  197. 116 2/2
  198. 87 3/3
  199. 74 HP:0040280
  200. 70 7.5000%
  201. 59 1/3
  202.  
  203.  
  204.  
  205. # are all rationals proper?
  206. cut -f7 v2.tab | awk -F'/' '$2>0{if($1>$2)print}'
  207. # of course they are.
  208.  
  209. ##################################
  210. # sex
  211. cut -f8 v2.tab | sort | uniq -c | sort -nr
  212. 92647
  213. 58 Male
  214. 22 Female
  215.  
  216.  
  217. ################################
  218. # negation
  219. cut -f9 v2.tab | sort | uniq -c | sort -nr
  220. 91939
  221. 788 NO
  222.  
  223. #################################
  224. # modifier
  225. cut -f10 v2.tab | sort | uniq -c | sort -nr
  226. 91950
  227. 307 HP:0012825
  228. 194 HP:0012828
  229. 53 HP:0003676
  230. 42 HP:0025303
  231. 35 HP:0012829
  232. 27 HP:0012832
  233. 26 HP:0012833
  234. 23 HP:0031796
  235. 16 HP:0031375
  236. 15 HP:0012826
  237. 12 HP:0012840
  238. 11 HP:0012839
  239. 8 HP:0012837
  240. 5 HP:0011010
  241. 1 HP:0030650
  242. 1 HP:0012827
  243. 1 HP:0003831
  244.  
  245. ########################################
  246. # description
  247. # cut -f11 v2.tab | sort | uniq -c | sort -nr | less
  248. # 60357 empty
  249. # OMIM screaming caps (mostly descriptive)
  250. # and some other more random hint-like statements
  251. # including data that visually seems like it belongs in other columns
  252.  
  253. frequency
  254. 5% to 13%
  255. 2/7
  256.  
  257. sex
  258. in males
  259.  
  260. negation
  261. NOT
  262.  
  263. onset
  264. In infancy
  265. School age onset
  266. Onset usually before puberty
  267. Onset in early childhoos <- yep, childhoos
  268. Onset by age of three years
  269. Onset at birth or in childhood
  270. Onset about puberty
  271.  
  272.  
  273. Not sure where I would put a description consisting of "0"
  274.  
  275. ########################################
  276. # publication
  277. # hmmm... plural but more like
  278. # citations
  279.  
  280. # can be lists (with different separator than previous list)
  281. # can be missing curie suffix `OMIM:` (seventy like this)
  282. # can be url
  283. # can be ISBN
  284. # can mix curie case `PMID:17918734;pmid:12687501`
  285. # can be spaced out `PMID: 17223397`
  286. # can be bare integer `12089525`
  287. # can be folks `HPO:sdoelken`
  288. # there can be space after list separators (or not)
  289.  
  290. # howmany are lists:
  291. cut -f12 v2.tab | grep -c ';'
  292. 372
  293.  
  294.  
  295. ###############################################################
  296. # Note:
  297. # web search on 'GO_evidence_code' returns correct code descriptions as top hit
  298. # evidence
  299. cut -f13 v2.tab | sort | uniq -c | sort -nr
  300. 43897 TAS
  301. 42405 IEA
  302. 6400 PCS
  303. 25 ICE
  304.  
  305.  
  306. #####################################
  307. # assignedBy
  308. cut -f14 v2.tab | sort | uniq -c | sort -nr
  309. 42088 HPO:skoehler
  310. 36962 HPO:iea
  311. 13523 HPO:probinson
  312. 51 HPO:lccarmody
  313. 49 HPO:sdoelken
  314. 34 ZFIN:bruef; HPO:sdoelken
  315. 13 HPO:curators
  316. 6 PATOC:GVG; PATOC:PS
  317. 1 HPO:nvasilevsky
  318.  
  319. # Is there a good reason not to insist on ORCIDs?
  320. # surely these people must be amongst the most capable of understanding why.
  321.  
  322.  
  323. ##################################################
  324. # date_created
  325. cut -f15 v2.tab | sort | uniq -c | sort -nr | head
  326.  
  327. 40213 2009-02-17 A very busy day
  328. 7718 2017-07-13
  329. 6532 2012-10-17
  330. 2434 2015-12-30
  331. 2185 2010-06-20
  332. 1958 2010-06-19
  333. 1145 2012-11-18
  334. 1089 2010-06-18
  335. 1014 2012-04-24
  336. 942 2014-11-26
  337.  
  338. the great thing about this format is how easy it is to spot outliers
  339.  
  340. cut -f15 v2.tab | sort -u | head
  341. 2009-02-17
  342. 2009-07-24
  343. 2009-07-31
  344. 2009-08-31
  345. 2009-09-17
  346. 2009-10-01
  347. 2009-10-02
  348. 2009-10-09
  349. 2009-10-15
  350. 2009-10-16
  351.  
  352.  
  353. cut -f15 v2.tab | sort -u | tail
  354. 2017-12-11
  355. 2017-12-12
  356. 2017-12-13
  357. 2017-12-17
  358. 2017-12-22
  359. 2018-01-25
  360. 2018-01-28
  361. 2018-03-04
  362. 2018-03-05
  363. 2018-03-07
  364.  
  365.  
  366. # check the rest
  367.  
  368. for date in $(cut -f15 v2.tab | sort -u); do
  369. date --date=${date};
  370. done | grep invalid
  371. date: invalid date ‘2018-15-20’
  372.  
  373. # apart from the one flagged above (2018-15-20), the dates all look valid.
Add Comment
Please, Sign In to add comment