Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- head -1 rare-diseases/annotated/v2smallFiles/DECIPHER-18.tab | tr '\t' '\n' | grep -n .
- 1:#diseaseID
- 2:diseaseName
- 3:phenotypeId
- 4:phenotypeName
- 5:onsetId
- 6:onsetName
- 7:frequency
- 8:sex
- 9:negation
- 10:modifier
- 11:description
- 12:publication
- 13:evidence
- 14:assignedBy
- 15:dateCreated
- ########################
- # easier to survey the records in one pile
- grep -h -v "^#" rare-diseases/annotated/v2smallFiles/*.tab > v2.tab
- # howmany rows
- wc -l < v2.tab
- 92,727
- # right number of columns every time?
- awk -F'\t' 'NF!=15' v2.tab
- # yep
- # disease identifier types
- cut -f1 -d':' v2.tab | sort | uniq -c | sort -nr
- 92430 OMIM
- 297 DECIPHER
- # files with the most rows
- cut -f1 v2.tab | uniq -c | sort -nr| head
- 131 OMIM:312870
- 128 OMIM:180849
- 108 OMIM:607872
- 108 OMIM:194050
- 99 OMIM:613406
- 95 OMIM:270400
- 94 OMIM:194190
- 87 OMIM:601803
- 87 OMIM:122470
- 85 OMIM:305600
- # check if any disease identifiers in more than one file?
- cut -f1 v2.tab | uniq -c | sort -nr| cut -c9- | uniq -c | sort -nr | head
- 1 OMIM:617537
- 1 OMIM:617526
- 1 OMIM:617506
- 1 OMIM:617478
- 1 OMIM:617468
- 1 OMIM:617466
- 1 OMIM:617460
- 1 OMIM:617452
- 1 OMIM:617450
- 1 OMIM:617442
- # no
- # summary stats on rows per file
- cut -f1 v2.tab | uniq -c | sort -nr| cut -c1-8 | ./sumstat.r
- V1
- Min. : 1.00
- 1st Qu.: 4.00
- Median : 8.00
- Mean : 12.61
- 3rd Qu.: 17.00
- Max. :131.00
- [1] "sd :12.46"
- ###########################################
- # sumstat.r
- #! /usr/bin/Rscript --vanilla
- x <- read.csv('stdin', header = F);
- summary(x);
- sprintf("sd :%.02f", sd(x[,1]));
- ###########################################
- ###########################################
- # diseaseID
- # howmany distinct disease identifiers
- cut -f1 v2.tab | uniq | wc -l
- 7351
- #####################################
- # diseaseName
- # howmany distinct disease ... "word thingies"(tm)
- cut -f2 v2.tab | uniq | wc -l
- 12378
- # not 1:1
- # sometimes appears to be a ';;' separated list
- # howmany are lists?
- cut -f2 v2.tab | sort -u | grep -c ";;"
- 2437
- ##########################################
- # phenotypeId
- # howmany?
- cut -f3 v2.tab | wc -l
- 92727
- # unique?
- cut -f3 v2.tab | sort -u | wc -l
- 6994
- # have correct curie prefix?
- cut -f3 v2.tab | cut -f1 -d':' | uniq -c
- 92727 HP
- ################################
- # phenotypeName
- cut -f4 v2.tab | wc -l
- 92727
- cut -f4 v2.tab | sort -u | wc -l
- 6994
- # anything not simple words? any punctuation indicating lists?
- cut -f4 v2.tab | grep -v "[a-z A-Z]*"
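- # nothing -- but this check is vacuous: "[a-z A-Z]*" also matches the empty
- # string, so every line matches and `grep -v` can never print anything.
- # A test that would actually surface punctuation: grep '[^a-zA-Z ]'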
- ################################
- # onsetId
- cut -f5 v2.tab | sort | uniq -c | sort -nr
- 92194
- 134 HP:0003577
- 97 HP:0003593
- 87 HP:0011463
- 74 HP:0003623
- 50 HP:0003581
- 45 HP:0003621
- 33 HP:0003584
- 8 HP:0011462
- 3 HP:0003596
- 1 HP:0011461
- 1 HP:0003674
- #####################################
- # onsetName
- 92194
- 134 Congenital onset
- 97 Infantile onset
- 87 Childhood onset
- 74 Neonatal onset
- 50 Adult onset
- 45 Juvenile onset
- 33 Late onset
- 8 Young adult onset
- 3 Middle age onset
- 1 Onset
- 1 Fetal onset
- ##########################################
- Here columns reordered from last time
- repo was not in sync on removed files
- ################################
- # frequency
- # sparse hodgepodge of:
- # nothing
- # identifiers
- # rationals
- # percentages (including ranges of percentages)
- # easy: express percentages more uniformly
- # hard: recover the proper rational a percentage was derived from (curating pubs)
- #
- # mixing identifiers and rationals;
- # I guess one sparse column is better than two
- # but what would really make it worthwhile is if the identifiers
- # referenced a value (back in the ontology) which allowed them to be comparable
- # (even approximately) to the proper rationals (and percentages)
- #
- # Okay we sort of have that but uncomputably in a string
- HP:0040283 "Occasional (29-5%)"
- Maybe something like statements along the lines of:
- <HP:0040283> <???:greater than or equal to> 0.05 .
- <HP:0040283> <???:less than or equal to> 0.29 .
- ----------------------------------------------------
- cut -f7 v2.tab | sort | uniq -c | sort -nr | head
- 84602
- 5087 HP:0040283
- 682 HP:0040282
- 179 HP:0040281
- 116 2/2
- 87 3/3
- 74 HP:0040280
- 70 7.5000%
- 59 1/3
- # are all rationals proper?
- cut -f7 v2.tab | awk -F'/' '$2>0{if($1>$2)print}'
- # of course they are.
- ##################################
- # sex
- cut -f8 v2.tab | sort | uniq -c | sort -nr
- 92647
- 58 Male
- 22 Female
- ################################
- # negation
- cut -f9 v2.tab | sort | uniq -c | sort -nr
- 91939
- 788 NO
- #################################
- # modifier
- cut -f10 v2.tab | sort | uniq -c | sort -nr
- 91950
- 307 HP:0012825
- 194 HP:0012828
- 53 HP:0003676
- 42 HP:0025303
- 35 HP:0012829
- 27 HP:0012832
- 26 HP:0012833
- 23 HP:0031796
- 16 HP:0031375
- 15 HP:0012826
- 12 HP:0012840
- 11 HP:0012839
- 8 HP:0012837
- 5 HP:0011010
- 1 HP:0030650
- 1 HP:0012827
- 1 HP:0003831
- ########################################
- # description
- # cut -f11 v2.tab | sort | uniq -c | sort -nr | less
- # 60357 empty
- # OMIM screaming caps (mostly descriptive)
- # and some other more random hint like statements
- # including data that visually seems like it belongs in other columns
- frequency
- 5% to 13%
- 2/7
- sex
- in males
- negation
- NOT
- onset
- In infancy
- School age onset
- Onset usually before puberty
- Onset in early childhoos <- yep, childhoos
- Onset by age of three years
- Onset at birth or in childhood
- Onset about puberty
- Not sure where I would put a description consisting of "0"
- ########################################
- # publication
- # hmmm... plural but more like
- # citations
- # can be lists (with different separator than previous list)
- # can be missing curie suffix `OMIM:` (seventy like this)
- # can be url
- # can be ISBN
- # can mix curie case `PMID:17918734;pmid:12687501`
- # can be spaced out `PMID: 17223397`
- # can be bare integer `12089525`
- # can be folks `HPO:sdoelken`
- # there can be space after list separators (or not)
- # howmany are lists:
- cut -f12 v2.tab | grep -c ';'
- 372
- ###############################################################
- # Note:
- # web search on 'GO_evidence_code' returns correct code descriptions as top hit
- # evidence
- cut -f13 v2.tab | sort | uniq -c | sort -nr
- 43897 TAS
- 42405 IEA
- 6400 PCS
- 25 ICE
- #####################################
- # assignedBy
- cut -f14 v2.tab | sort | uniq -c | sort -nr
- 42088 HPO:skoehler
- 36962 HPO:iea
- 13523 HPO:probinson
- 51 HPO:lccarmody
- 49 HPO:sdoelken
- 34 ZFIN:bruef; HPO:sdoelken
- 13 HPO:curators
- 6 PATOC:GVG; PATOC:PS
- 1 HPO:nvasilevsky
- # Is there a good reason not to insist on ORCIDs?
- # surely these people must be amongst the most capable of understanding why.
- ##################################################
- # dateCreated
- cut -f15 v2.tab | sort | uniq -c | sort -nr | head
- 40213 2009-02-17 A very busy day
- 7718 2017-07-13
- 6532 2012-10-17
- 2434 2015-12-30
- 2185 2010-06-20
- 1958 2010-06-19
- 1145 2012-11-18
- 1089 2010-06-18
- 1014 2012-04-24
- 942 2014-11-26
- the great thing about this format is how easy it is to spot outliers
- cut -f15 v2.tab | sort -u | head
- 2009-02-17
- 2009-07-24
- 2009-07-31
- 2009-08-31
- 2009-09-17
- 2009-10-01
- 2009-10-02
- 2009-10-09
- 2009-10-15
- 2009-10-16
- cut -f15 v2.tab | sort -u | tail
- 2017-12-11
- 2017-12-12
- 2017-12-13
- 2017-12-17
- 2017-12-22
- 2018-01-25
- 2018-01-28
- 2018-03-04
- 2018-03-05
- 2018-03-07
- # check the rest
- for date in $(cut -f15 v2.tab | sort -u); do
- date --date=${date};
- done | grep invalid
- date: invalid date ‘2018-15-20’
- # all but one of the dates are valid: 2018-15-20 has month 15.
Add Comment
Please, Sign In to add comment