Untitled

head -1 rare-diseases/annotated/v2smallFiles/DECIPHER-18.tab | tr '\t' '\n' | grep -n .
1:#diseaseID
2:diseaseName
3:phenotypeId
4:phenotypeName
5:onsetId
6:onsetName
7:frequency
8:sex
9:negation
10:modifier
11:description
12:publication
13:evidence
14:assignedBy
15:dateCreated

########################

# easier to survey the records in one pile
grep -h -v "^#" rare-diseases/annotated/v2smallFiles/*.tab > v2.tab

# howmany rows
wc -l < v2.tab
92,727

# right number of columns every time?
awk -F'\t' 'NF!=15' v2.tab
# yep

# disease identifier types
cut -f1  -d':' v2.tab | sort | uniq -c | sort -nr
  92430 OMIM
    297 DECIPHER

# files with the most rows
cut -f1  v2.tab |  uniq -c | sort -nr| head
    131 OMIM:312870
    128 OMIM:180849
    108 OMIM:607872
    108 OMIM:194050
     99 OMIM:613406
     95 OMIM:270400
     94 OMIM:194190
     87 OMIM:601803
     87 OMIM:122470
     85 OMIM:305600

# check if any disease identifiers appear in more than one file?
# Collapse each file's run of rows to a single line, then sort by identifier
# so that repeats coming from different files become adjacent before they are
# counted. (`uniq -c` only merges adjacent lines; ordering by row count first
# would keep repeats with different row counts apart and hide them.)
cut -f1  v2.tab | uniq | sort | uniq -c | sort -nr | head
      1 OMIM:617537
      1 OMIM:617526
      1 OMIM:617506
      1 OMIM:617478
      1 OMIM:617468
      1 OMIM:617466
      1 OMIM:617460
      1 OMIM:617452
      1 OMIM:617450
      1 OMIM:617442
# no

# summary stats on rows per file
 cut -f1  v2.tab |  uniq -c | sort -nr| cut -c1-8 | ./sumstat.r
       V1
 Min.   :  1.00
 1st Qu.:  4.00
 Median :  8.00
 Mean   : 12.61
 3rd Qu.: 17.00
 Max.   :131.00
[1] "sd :12.46"

###########################################
# sumstat.r
#! /usr/bin/Rscript --vanilla
	# Read headerless values from standard input, then print the summary()
	# of the resulting data frame followed by the standard deviation of its
	# first column. Both results rely on top-level auto-printing.
	counts <- read.csv("stdin", header = FALSE)
	summary(counts)
	# FALSE is spelled out above because the alias F can be reassigned
	sprintf("sd :%.02f", sd(counts[, 1]))
###########################################


###########################################
# diseaseID
# howmany distinct disease identifiers
cut -f1 v2.tab | uniq | wc -l
7351

#####################################
# diseaseName
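# how many distinct disease ... "word thingies"(tm)
# (caveat: `uniq` without a preceding `sort` counts runs of adjacent equal
#  names, so a name that recurs non-adjacently is counted more than once)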
cut -f2 v2.tab | uniq | wc -l
12378
# not 1:1

# sometimes appears to be a ';;' separated list
# how many are lists?
cut -f2 v2.tab | sort -u | grep -c  ";;"
2437

##########################################
# phenotypeId
# howmany?
cut -f3 v2.tab | wc -l
92727
# unique?
cut -f3 v2.tab | sort -u | wc -l
6994

# have correct curie prefix?
cut -f3 v2.tab | cut -f1 -d':' | uniq -c
  92727 HP

################################
# phenotypeName
cut -f4 v2.tab  | wc -l
92727
cut -f4 v2.tab  | sort -u | wc -l
6994

# anything not simple words? any punctuation indicating lists?
# List every name containing a character other than an ASCII letter or space.
# (The earlier pattern "[a-z A-Z]*" can match the empty string, so it matched
#  every line and `grep -v` could never print anything.)
cut -f4 v2.tab  | grep "[^a-zA-Z ]"
# result pending a re-run: the "nothing" recorded here came from the vacuous pattern
################################
# onsetId
cut -f5 v2.tab | sort | uniq -c | sort -nr
  92194
    134 HP:0003577
     97 HP:0003593
     87 HP:0011463
     74 HP:0003623
     50 HP:0003581
     45 HP:0003621
     33 HP:0003584
      8 HP:0011462
      3 HP:0003596
      1 HP:0011461
      1 HP:0003674

#####################################
# onsetName
  92194
    134 Congenital onset
     97 Infantile onset
     87 Childhood onset
     74 Neonatal onset
     50 Adult onset
     45 Juvenile onset
     33 Late onset
      8 Young adult onset
      3 Middle age onset
      1 Onset
      1 Fetal onset

##########################################

Here columns reordered from last time

repo was not in sync on removed files

################################
# frequency
# sparse hodgepodge of:
#   nothing
#	identifiers
#	rationals
#	percentages (including ranges of percentages)

# easy: express percentages more uniformly
# hard: recover the proper rational a percentage was derived from (curating pubs)
#
# mixing identifiers and rationals;
# I guess one sparse column is better than two
# but what would really make it worthwhile is if the identifiers
# referenced a value (back in the ontology) which allowed them to be comparable
# (even approximately) to the proper rationals (and percentages)
#
# Okay, we sort of have that, but uncomputably, in a string

HP:0040283   "Occasional (29-5%)"

Maybe something like statements along the lines of:

<HP:0040283>  <???:greater than or equal to>  0.05 .
<HP:0040283>  <???:less than or equal to>  0.29 .

----------------------------------------------------

cut -f7 v2.tab | sort | uniq -c | sort -nr | head
 84602
   5087 HP:0040283
    682 HP:0040282
    179 HP:0040281
    116 2/2
     87 3/3
     74 HP:0040280
     70 7.5000%
     59 1/3


# are all rationals proper?
cut -f7 v2.tab | awk -F'/' '$2>0{if($1>$2)print}'
# of course they are.

##################################
# sex
cut -f8 v2.tab | sort | uniq -c | sort -nr
  92647
     58 Male
     22 Female


################################
# negation
cut -f9 v2.tab | sort | uniq -c | sort -nr
  91939
    788 NO

#################################
# modifier
cut -f10 v2.tab | sort | uniq -c | sort -nr
  91950
    307 HP:0012825
    194 HP:0012828
     53 HP:0003676
     42 HP:0025303
     35 HP:0012829
     27 HP:0012832
     26 HP:0012833
     23 HP:0031796
     16 HP:0031375
     15 HP:0012826
     12 HP:0012840
     11 HP:0012839
      8 HP:0012837
      5 HP:0011010
      1 HP:0030650
      1 HP:0012827
      1 HP:0003831

########################################
# description
# cut -f11 v2.tab | sort | uniq -c | sort -nr | less
# 60357  empty
# OMIM screaming caps (mostly descriptive)
# and some other more random hint-like statements
# including data that visually seems like it belongs in other columns

	frequency
		5% to 13%
		2/7

	sex
		in males

	negation
		NOT

	onset
		In infancy
		School age onset
		Onset usually before puberty
		Onset in early childhoos               <- yep, childhoos
      	Onset by age of three years
      	Onset at birth or in childhood
      	Onset about puberty


Not sure where I would put a description consisting of "0"

########################################
# publication
# hmmm... plural but more like
# citations

# can be lists (with different separator than previous list)
# can be missing curie suffix  `OMIM:`  (seventy like this)
# can be url
# can be ISBN
# can mix curie case 	`PMID:17918734;pmid:12687501`
# can be spaced out   	`PMID:    17223397`
# can be bare integer  	`12089525`
# can be folks   		`HPO:sdoelken`
# there can be space after list separators (or not)

# howmany are lists:
cut -f12 v2.tab | grep -c ';'
372


###############################################################
# Note:
# web search on 'GO_evidence_code' returns correct code descriptions as top hit
# evidence
cut -f13 v2.tab | sort | uniq -c | sort -nr
  43897 TAS
  42405 IEA
   6400 PCS
     25 ICE


#####################################
# assignedBy
cut -f14 v2.tab | sort | uniq -c | sort -nr
  42088 HPO:skoehler
  36962 HPO:iea
  13523 HPO:probinson
     51 HPO:lccarmody
     49 HPO:sdoelken
     34 ZFIN:bruef; HPO:sdoelken
     13 HPO:curators
      6 PATOC:GVG; PATOC:PS
      1 HPO:nvasilevsky

# Is there a good reason not to insist on ORCIDs?
# surely these people must be amongst the most capable of understanding why.


##################################################
# dateCreated
 cut -f15 v2.tab | sort | uniq -c | sort -nr | head

  40213 2009-02-17     A very busy day
   7718 2017-07-13
   6532 2012-10-17
   2434 2015-12-30
   2185 2010-06-20
   1958 2010-06-19
   1145 2012-11-18
   1089 2010-06-18
   1014 2012-04-24
    942 2014-11-26

the great thing about this format is how easy it is to spot outliers

cut -f15 v2.tab | sort -u | head
2009-02-17
2009-07-24
2009-07-31
2009-08-31
2009-09-17
2009-10-01
2009-10-02
2009-10-09
2009-10-15
2009-10-16


cut -f15 v2.tab | sort -u | tail
2017-12-11
2017-12-12
2017-12-13
2017-12-17
2017-12-22
2018-01-25
2018-01-28
2018-03-04
2018-03-05
2018-03-07


# check the rest

# Feed every distinct dateCreated value to date(1) and keep only complaints.
# date reports an unparsable value on stderr, so stderr is merged into the
# pipe; otherwise the message would bypass grep and only show up by accident.
for date in $(cut -f15 v2.tab | sort -u); do
	date --date="${date}" 2>&1;
done | grep invalid
date: invalid date ‘2018-15-20’

# all but one of the dates are valid: 2018-15-20 is not a real date (month 15).