Advertisement
Guest User

oracle

a guest
Nov 21st, 2019
261
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 23.00 KB | None | 0 0
  1. X <- read.table("Global_25_PCA_pop_averages.txt", row.names=1, head = T, stringsAsFactors=T, na.strings=c('',' ','NA'))
  2.  
  3. single <- function(Y,k){
  4. NPOPS<-dim(X)[1]
  5.  
  6. ID <- vector(length=NPOPS)
  7. ID[1:NPOPS] <- c(as.matrix(row.names(X)))
  8.  
  9. X <- X[,1:25]
  10.  
  11. if (length(Y)==1)
  12. Y <- X[which(ID==Y),]
  13.  
  14. YDIST<-vector(length=NPOPS)
  15.  
  16. for (i in 1:NPOPS)
  17. YDIST[i] <- sqrt(sum((Y-X[i,])^2))
  18.  
  19.  
  20. ORDER<-order(YDIST)
  21. ORDER<-cbind(ID[ORDER[1:k]],round(YDIST[ORDER[1:k]],4))
  22. print(ORDER)
  23. }
  24.  
  25.  
  26.  
  27. twoWay <- function(Y,k){
  28. NPOPS<-dim(X)[1]
  29.  
  30. ID <- vector(length=NPOPS)
  31. ID[1:NPOPS] <- c(as.matrix(row.names(X)))
  32.  
  33.  
  34.  X <- X[,1:25]
  35.  
  36. if (length(Y)==1)
  37. Y <- X[which(ID==Y),]
  38.  
  39.  
  40. YDIST<-vector(length=NPOPS+NPOPS*(NPOPS-1)/2)
  41.  
  42.  
  43.  
  44.  
  45. XDIST <- as.matrix(dist(X))
  46.  
  47. for (i in 1:NPOPS)
  48. YDIST[i] <- sqrt(sum((Y-X[i,])^2))
  49.  
  50. ID2 <- vector()
  51. YDIST2 <- vector()
  52. COUNT <- NPOPS
  53.  
  54.         for (i in 1:(NPOPS-1))
  55.         for (j in (i+1):NPOPS) {
  56.             COUNT<-COUNT+1
  57.             if (abs(YDIST[i]-YDIST[j])<XDIST[i,j] & YDIST[i]>0 & YDIST[j]>0) {
  58.                 FRAC <- (XDIST[i,j]^2+YDIST[i]^2-YDIST[j]^2)/(2*XDIST[i,j]^2)
  59.                 if (FRAC>=0 & FRAC<=1) {
  60.                     YDIST2[COUNT] <- sqrt(YDIST[i]^2-FRAC^2*XDIST[i,j]^2)
  61.                     ID2[COUNT] <- paste(round(100*(1-FRAC),1),"% ",ID[i]," + ",round(100*FRAC,1),"% ",ID[j],sep=" ")
  62.                     }
  63.                 else
  64.                 YDIST2[COUNT]<- 100
  65.                 }
  66.             else
  67.             YDIST2[COUNT]<- 100
  68.             }
  69.        
  70.  
  71. ORDER<-order(YDIST2)
  72. ORDER<-cbind(ID2[ORDER[1:k]],round(YDIST2[ORDER[1:k]],4))
  73. print(ORDER)
  74. }
  75.  
  76.  
  77.  
  78. fifty <- function (Y,k)
  79.  {
  80.  NPOPS<-dim(X)[1]
  81.  ID <- vector()
  82. ID[1:NPOPS] <- c(as.matrix(row.names(X)))
  83.  X <- X[,1:25]
  84.  
  85. if (length(Y)==1)
  86. Y <- X[which(ID==Y),]
  87.  
  88.  
  89. YDIST<-vector()
  90. XDIST <- as.matrix(dist(X))
  91.  
  92. for (i in 1:NPOPS)
  93. YDIST[i] <- sqrt(sum((Y-X[i,])^2))
  94. ID2 <- vector()
  95. YDIST2 <- vector()
  96.  
  97.         COUNT <- NPOPS
  98.         for (i in 1:(NPOPS-1))
  99.         for (j in (i+1):NPOPS) {
  100.             COUNT<-COUNT+1
  101.             YDIST2[COUNT] <- sqrt(max(YDIST[i],YDIST[j])^2-(0.5)^2*XDIST[i,j]^2)
  102.             ID2[COUNT] <- paste("50%","% ",ID[i]," + ","50%","% ",ID[j])
  103.             }
  104.  
  105.  
  106. ORDER<-order(YDIST2)
  107. ORDER<-cbind(ID2[ORDER[1:k]],round(YDIST2[ORDER[1:k]],4))
  108. print(ORDER)
  109. }
  110.  
  111.  
  112.  
  113. #*******************************************************
  114. # R script nMonte.R
  115. # Find mixture composition which minimizes
  116. # the averaged genetic distance to target.
  117. # activate with: source('nMonte.R')
  118. # Use: getMonte(datafile,targetfile);
  119. # both files should be comma-separated csv.
  120. # Utilities:
  121. # subset_data(): Collecting rows from datasheet
  122. # aggr_pops(): Average populations
  123. # tab2comma(): tab-separated to comma-separated
  124. # last modified: averaging, median, percentage
  125. # v9.0 Huijbregts 29 nov 2017
  126. #*******************************************************
  127.  
  128. # default global constants
  129. batch_0  = 1000    # default rows of sample randomly drawn from data file
  130. cycles_0 = 1000   # default number of cycles
  131.  
  132. # START OF GETMONTE FUNCTION
  133. getMonte <- function(targetfile,omit='',Nbatch=batch_0,Ncycles=cycles_0,save=F) {
  134.    
  135.     # define AlGORITHM embedded function
  136.     do_algorithm <- function(selection, targ) {
  137.         mySel <- as.matrix(selection, rownames.force = NA)      
  138.         myTg <- as.matrix(targ, rownames.force = NA)
  139.         dif2targ <- sweep(mySel, 2, myTg, '-')    # data - target      
  140.         Ndata <- nrow(dif2targ)
  141.         kcol <- ncol(dif2targ)
  142.         rowLabels <- rownames(dif2targ)
  143.         # preallocate data
  144.         matPop    <- matrix(NA_integer_,  Nbatch, 1, byrow=T)
  145.         dumPop    <- matrix(NA_integer_,  Nbatch, 1, byrow=T)
  146.         matAdmix  <- matrix(NA_real_, Nbatch, kcol, byrow=T)
  147.         dumAdmix  <- matrix(NA_real_, Nbatch, kcol, byrow=T)
  148.         # initialize data for first cycle
  149.         ## a. start with nearest neighbor:
  150.         # distQQ <- rowSums(dif2targ^2)
  151.         # minPop <- as.integer(which(distQQ==min(distQQ))[1])
  152.         # matPop <- rep(minPop, Nbatch)        
  153.         # b. start with random pops:
  154.         matPop <- sample(1:Ndata,Nbatch,replace=T)        
  155.         # fill matPop with random row numbers 1:Ndata from datafile
  156.         matAdmix   <- dif2targ[matPop,]
  157.         colM1 <- colMeans(matAdmix)           # rowvector of column means
  158.         eval1 <- sum(colM1^2)    # sum of squared distances  
  159.         # Ncycles iterations
  160.         for (c in 1:Ncycles) {
  161.             # fill batch data
  162.             dumPop <- sample(1:Ndata, Nbatch, replace=T)
  163.             dumAdmix   <- dif2targ[dumPop,]
  164.             # loop thru batch
  165.             # minimize Eucl dist of batch mean to target
  166.             for (b in 1:Nbatch) {
  167.                 # test alternative pop
  168.                 store <- matAdmix[b,]
  169.                 matAdmix[b,] <- dumAdmix[b,]
  170.                 colM2 <- colMeans(matAdmix)
  171.                 eval2 <- sum(colM2^2)
  172.                 # conditional adjust
  173.                 if (eval2 <= eval1) {
  174.                     matPop[b] <- dumPop[b]
  175.                     colM1 <- colM2
  176.                     eval1 <- eval2
  177.                 } else {matAdmix[b,] <- store}
  178.             } # end batch
  179.         } # end cycles
  180.         # Collect output
  181.         # get fit of target
  182.         fitted <- t(colMeans(matAdmix) + myTg[1,])
  183.         # collect sampled populations
  184.         popX<- matPop
  185.         # substute row numbers by row names
  186.         populations <- factor(rowLabels[popX],levels=rowLabels)
  187.         # return list of 2 objects
  188.         return(list('estimated'=fitted, 'pops'=populations))
  189.     }   # end do_algorithm    
  190.    
  191.     # define OUTPUT embedded function
  192.     # except pop correlations
  193.     do_output <- function(estim, pops){
  194.         # set stdOut to sinkFile
  195.         if (save!=F) {
  196.             sinkFile <- nameIsFree(save)            
  197.             sink(sinkFile, append=T, split=T)
  198.         }
  199.         #print(paste('Ncycles=',Ncycles,sep=' '))    
  200.         #print target and estimation by col
  201.         dif <- estim - myTarget
  202.    
  203.         #matrix_out <- rbind(myTarget, estim, dif)
  204.         #rownames(matrix_out)[2:3] <- c('fitted','dif')
  205.         #print(matrix_out)    
  206.         # distance
  207.         dist1_2 <- sqrt(sum(dif^2))
  208.         dist1_2 <- dist1_2/PCT
  209.         print(paste('distance%=',round(100*dist1_2,4),sep=''))
  210.         write('',stdout())
  211.         # table percentages by pop
  212.         tgname <- row.names(myTarget)[1]
  213.         write(paste('\t',tgname),stdout())
  214.         write('',stdout())
  215.         tb <- table(pops)
  216.         tb <- tb[order(tb, decreasing=T)]
  217.         tb <- as.matrix(100*tb/Nbatch)
  218.    
  219.     i <- 1
  220.     while(tb[i]>0)
  221.          {
  222.     i <- i+1
  223.     }
  224.     tb <- tb[1:(i-1), ,drop=FALSE]
  225.    
  226.  
  227.         write.table(tb,sep=',',quote=F,col.names=F,dec='.')
  228.         # reset sinkFile to stdOut  
  229.         if (save!=F) {sink()}
  230.     }   # end do_output
  231.  
  232.     # MAIN code of getMonte
  233.     # set environment for embedded functions
  234.     # proces input
  235.     tempData <- X
  236.  
  237.     myData <- tempData[rownames(tempData)!=omit,]
  238.  
  239. if (length(targetfile)==1){
  240. NPOPS<-dim(X)[1]
  241.  
  242. ID <- vector(length=NPOPS)
  243. ID[1:NPOPS] <- c(as.matrix(row.names(X)))
  244.  
  245.  
  246. myTarget <- X[which(ID==targetfile),]
  247.  
  248. myData <- myData[-c(which(ID==targetfile)), ]
  249. }
  250.  
  251. else{
  252. myTarget <-targetfile
  253. names(myTarget) <- c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","PC11","PC12","PC13","PC14","PC15","PC16","PC17","PC18","PC19","PC20","PC21","PC22","PC23","PC24","PC25")
  254. myTarget <- t(myTarget)
  255. row.names(myTarget) <- " "
  256. }
  257.    
  258.     check_formats(myData, myTarget)
  259.     check_omit(myData, omit)    # single item distances
  260.     PCT <- ifelse(max(myData[1, ]>2), 100, 1)
  261.     #print('1. CLOSEST SINGLE ITEM DISTANCE%')
  262.     #print(nearestItems(myData, myTarget)*100/PCT)
  263.     #cat('\n')
  264.  
  265.     # full table nMonte
  266.     print('nMONTE 1')
  267.     results <- do_algorithm(myData, myTarget)
  268.     fitted <- results$estimated
  269.     populations <- results$pops
  270.     do_output(fitted, populations)
  271.     cat('\n')
  272.    
  273.     #print('CORRELATION OF ADMIXTURE POPULATIONS')
  274.     #tb <- table(populations)
  275.     #selFinal <- names(tb[tb>0])
  276.     #adFinal <- myData[selFinal,,drop=F]
  277.     # catch error
  278.     #if (nrow(adFinal)==1) {print('Only 1 population, no correlations.')}
  279.     #else {
  280.     #    corPops <- cor(t(adFinal))
  281.     #    round(corPops, digits=2)
  282.     #}
  283.  
  284. } # end getMonte function
  285.  
  286. #===================================utilities===================================
  287. #-------------------------------------------------------------------------------
  288. # function subset_data()
  289. # utility for selecting rows from datasheet
  290. # Use: subset_data('DavidMadeThis.csv', 'IselectedThis.csv' ,'Abkhasian', 'Adygei', 'Afanasievo', 'Altai_IA')
  291. # In: name of primary dataFile, name of output file, list of selected pops from primary datafile
  292. # Out: secondary datasheet with selected subset
  293. # USE WITH ONE SELECTED POPULATION TO CREATE TARGET FILE
  294. # Error message 1: non-existence or misspelled name of selected pop
  295. # Error message 2: no pops selected
  296. # Error message 3: output file exists, choose new name    
  297. #-------------------------------------------------------------------------------
  298. subset_data <- function(dataFile, saveFile, ...) {
  299.     input <- read.csv(dataFile, head=T, row.names=1, stringsAsFactor=F)
  300.     selection <- list(...)
  301.     selError <- selection[!selection %in% rownames(input)]
  302.     # test selection
  303.     if (length(selError)>0) {
  304.         cat(paste(selError,' is not a valid rowname\n',sep='')) }
  305.     # output
  306.     else {output <- input[rownames(input) %in% selection,]
  307.           print(output)                                       # print to screen
  308.           write.csv(output, nameIsFree(saveFile), quote=F)    # save to file
  309.          }
  310. }
  311.  
  312. #-------------------------------------------------------------------------------
  313. # function aggr_pops()
  314. # In the files of Davidski rowlabel has the form 'pop:ID'
  315. # This function drops the part after the colon
  316. # and collects the mean of the pop before the colon.
  317. # Use for mean:   aggr_pops(fileName)
  318. # Use for median: aggr_pops(fileName, myFunc=median)
  319. #-------------------------------------------------------------------------------
  320. aggr_pops <- function(fileName, myFunc=mean) {
  321.     myData <- read.csv(fileName, head=T, row.names=1, stringsAsFactors=FALSE)
  322.     splitted <- strsplit(rownames(myData),split=':',fixed=T)
  323.     splitmat <- t(matrix(unlist(splitted), length(splitted[[1]]), length(splitted)))
  324.     aggrData <- aggregate(myData, by=list(splitmat[,1]),myFunc)
  325.     temp <- as.matrix(aggrData[,-1]); rownames(temp) <- aggrData[,1]
  326.     return(round(temp, 7))
  327. }
  328.  
  329. #-------------------------------------------------------------------------------
  330. # function tab2comma()
  331. # Convert tab-separated csv to comma-separated csv
  332. # Use: tab2comma(tabFile,commaFile)
  333. #-------------------------------------------------------------------------------
  334. tab2comma <- function(tabFile,commaFile) {
  335.     data <- read.csv(tabFile, head=T, sep='\t', row.names=1, stringsAsFactor=F)
  336.     nameIsFree(commaFile)
  337.     write.csv(data, commaFile, row.names=T)
  338.     }
  339.  
  340. #-------------------------------------------------------------------------------
  341. # function nearestItems()
  342. # Find n best matches.
  343. # Use: inData <- read file; inTarget <- read target
  344. #       nearestItems(inData, inTarget, maxFits=8)
  345. # This is not the nearest neighbor algorithm;
  346. # when the number of items is smaller than maxFits,
  347. # functions returns all the items.
  348. #-------------------------------------------------------------------------------
  349. nearestItems <- function(inData, inTarget, maxFits=8) {
  350.     totArr <- rbind(inTarget, inData)
  351.     distMat <- as.matrix(dist(totArr, method='euclidean'))
  352.     dist1 <- distMat[,1]
  353.     sortDist <- dist1[order(dist1)]
  354.     nFits <- min(nrow(inData), maxFits)
  355.     return(sortDist[2:(nFits+1)])    
  356.     }
  357.  
  358. #==================================internal stuff===============================
  359. nameIsFree <- function(newFile) {
  360.     while (file.exists(newFile)) {
  361.         newFile <- readline('select new filename for saving (without quotes): ')
  362.         }
  363.     return(newFile)
  364.     }
  365.    
  366. check_formats <- function(sheet, target) {
  367.     if (any(is.na(sheet)))  {err_row <- as.integer(which(rowSums(is.na(sheet))>0))
  368.                             print(sheet[err_row, ])
  369.                             stop(paste('Missing value in row ',err_row))}
  370.     if (!identical(colnames(sheet), colnames(target)))
  371.         {print(colnames(sheet)); print(colnames(target))
  372.          stop('Colnames input not identical')}
  373.     }
  374.  
  375. check_omit <- function(sheet, dropInfo) {
  376.     if (dropInfo != '' & !dropInfo %in% rownames(sheet)) {
  377.         print('!!!! WARNING: Request to omit unknown popName. !!!!')
  378.         }
  379.     }
  380. #*******************************************************
  381. # R script nMonte.R
  382. # Find mixture composition which minimizes
  383. # the averaged genetic distance to target.
  384. # Penalizing of distant admixtures.
  385. # Activate with: source('nMonte3_temp.R')
  386. # Use: getMonte(datafile, targetfile)
  387. # both files should be comma-separated csv.
  388. # Utilities:
  389. # subset_data(): Collecting rows from datasheet
  390. # aggr_pops(): Average populations
  391. # tab2comma(): tab-separated to comma-separated
  392. # last modified: headStrings
  393. # v10.4 Huijbregts 8 jan 2018
  394. #*******************************************************
  395.  
  396. # default global constants
  397. batch_def  =  500   # default rows of sample randomly drawn from data file
  398. cycles_def = 1000   # default number of cycles
  399. pen_def    = 0.001  # default penalty
  400.  
  401. # START OF GETMONTE FUNCTION
  402. getMonte3 <- function(targetfile,
  403.             omit='',Nbatch=batch_def,Ncycles=cycles_def,save=F,pen=pen_def) {
  404.  
  405.     # define AlGORITHM embedded function
  406.     do_algorithm <- function(selection, targ) {
  407.         mySel <- as.matrix(selection, rownames.force = NA)      
  408.         myTg <- as.matrix(targ, rownames.force = NA)
  409.         dif2targ <- sweep(mySel, 2, myTg, '-')    # data - target      
  410.         Ndata <- nrow(dif2targ)
  411.         kcol <- ncol(dif2targ)
  412.         rowLabels <- rownames(dif2targ)
  413.         # preallocate data
  414.         matPop    <- matrix(NA_integer_,  Nbatch, 1, byrow=T)
  415.         dumPop    <- matrix(NA_integer_,  Nbatch, 1, byrow=T)
  416.         matAdmix  <- matrix(NA_real_, Nbatch, kcol, byrow=T)
  417.         dumAdmix  <- matrix(NA_real_, Nbatch, kcol, byrow=T)
  418.         matPop    <- sample(1:Ndata,Nbatch,replace=T)        
  419.         # fill matPop with random row numbers 1:Ndata from datafile
  420.         matAdmix   <- dif2targ[matPop,]
  421.         # iniatialize objective function        
  422.         colM1 <- colMeans(matAdmix)
  423.         eval1 <- (1+pen) * sum(colM1^2)  
  424.         # Ncycles iterations
  425.         for (c in 1:Ncycles) {
  426.             # fill batch data
  427.             dumPop <- sample(1:Ndata, Nbatch, replace=T)
  428.             dumAdmix   <- dif2targ[dumPop,]
  429.             # loop thru batch
  430.             # penalty is squared distance of sample to target
  431.             # objective function =
  432.             #     squared dist of batch mean to target + coef*penalty
  433.             # minimize objective function
  434.             for (b in 1:Nbatch) {
  435.                 # test alternative pop
  436.                 store <- matAdmix[b,]
  437.                 matAdmix[b,] <- dumAdmix[b,]
  438.                 colM2 <- colMeans(matAdmix)
  439.                 eval2 <- sum(colM2^2) + pen*sum(matAdmix[b, ]^2)
  440.                 # conditional adjust
  441.                 if (eval2 <= eval1) {
  442.                     matPop[b] <- dumPop[b]
  443.                     colM1 <- colM2
  444.                     eval1 <- eval2
  445.                 } else {matAdmix[b,] <- store}
  446.             } # end batch
  447.         } # end cycles
  448.         # Collect output
  449.         # get fit of target
  450.         fitted <- t(colMeans(matAdmix) + myTg[1,])
  451.         # collect sampled populations
  452.         # split labels of reference samples
  453.         popl <- headStrings(rowLabels[matPop], mySep=':')
  454.         populations <- factor(popl)
  455.         # return list of 2 objects
  456.         return(list('estimated'=fitted, 'pops'=populations))
  457.     }   # end do_algorithm    
  458.    
  459.     # define OUTPUT embedded function
  460.     # except pop correlations
  461.     do_output <- function(estim, pops){
  462.         # set stdOut to sinkFile
  463.         if (save!=F) {
  464.             sinkFile <- nameIsFree(save)            
  465.             sink(sinkFile, append=T, split=T)
  466.         }
  467.         #print(paste('Ncycles=',Ncycles,sep=' '))    
  468.         # print target and estimation by col
  469.         dif <- estim - myTarget
  470.         #matrix_out <- rbind(myTarget, estim, dif)
  471.         #rownames(matrix_out)[2:3] <- c('fitted','dif')
  472.         #print(matrix_out)    
  473.         # distance
  474.         dist1_2 <- sqrt(sum(dif^2))
  475.         dist1_2 <- dist1_2/PCT
  476.         print(paste('distance%=',round(100*dist1_2,4),sep=''))
  477.         write('',stdout())
  478.         # table percentages by pop
  479.         tgname <- row.names(myTarget)[1]
  480.         write(paste('\t',tgname),stdout())
  481.         write('',stdout())
  482.         tb <- table(pops)
  483.         tb <- tb[order(tb, decreasing=T)]
  484.         tb <- as.matrix(100*tb/Nbatch)
  485.  
  486.    
  487.  
  488.         write.table(tb,sep=',',quote=F,col.names=F,dec='.')
  489.         # reset sinkFile to stdOut  
  490.         if (save!=F) {sink()}
  491.     }   # end do_output
  492.  
  493.     # MAIN code of getMonte
  494.     # set environment for embedded functions
  495.     # proces input
  496.     tempData <- X
  497.  
  498.     myData <- tempData[rownames(tempData)!=omit,]
  499. if (length(targetfile)==1){
  500. NPOPS<-dim(X)[1]
  501.  
  502. ID <- vector(length=NPOPS)
  503. ID[1:NPOPS] <- c(as.matrix(row.names(X)))
  504.  
  505. myTarget <- X[which(ID==targetfile),]
  506.  
  507. myData <- myData[-c(which(ID==targetfile)), ]
  508. }
  509.  
  510. else{
  511. myTarget <-targetfile
  512. names(myTarget) <- c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","PC11","PC12","PC13","PC14","PC15","PC16","PC17","PC18","PC19","PC20","PC21","PC22","PC23","PC24","PC25")
  513. myTarget <- t(myTarget)
  514. row.names(myTarget) <- " "
  515. }
  516.    
  517.     check_formats(myData, myTarget)
  518.     check_omit(myData, omit)    # single item distances
  519.     PCT <- ifelse(max(myData[1, ]>2), 100, 1)
  520.     #print('1. CLOSEST SINGLE ITEM DISTANCE%')
  521.     #print(nearestItems(myData, myTarget)*100/PCT)
  522.     #cat('\n')
  523.  
  524.     # full table nMonte
  525.     print('nMONTE 3')
  526.     results <- do_algorithm(myData, myTarget)
  527.     fitted <- results$estimated
  528.     populations <- results$pops
  529.     do_output(fitted, populations)
  530.     cat('\n')
  531.    
  532.     #print('CORRELATION OF ADMIXTURE POPULATIONS')
  533.     #tb <- table(populations)
  534.     #selFinal <- names(tb[tb>0])
  535.     #adFinal <- myData[selFinal,,drop=F]
  536.     # catch error
  537.     #if (nrow(adFinal)==1) {print('Only 1 population, no correlations.')}
  538.     #else {
  539.     #    corPops <- cor(t(adFinal))
  540.     #    round(corPops, digits=2)
  541.     #}
  542.  
  543. } # end getMonte function
  544. #===================================utilities===================================
  545. #-------------------------------------------------------------------------------
  546. # function subset_data()
  547. # utility for selecting rows from datasheet
  548. # Use: subset_data('DavidMadeThis.csv', 'IselectedThis.csv' ,'Abkhasian', 'Adygei', 'Afanasievo', 'Altai_IA')
  549. # In: name of primary dataFile, name of output file, list of selected pops from primary datafile
  550. # Out: secondary datasheet with selected subset
  551. # USE WITH ONE SELECTED POPULATION TO CREATE TARGET FILE
  552. # Error message 1: non-existence or misspelled name of selected pop
  553. # Error message 2: no pops selected
  554. # Error message 3: output file exists, choose new name    
  555. #-------------------------------------------------------------------------------
  556. subset_data <- function(dataFile, saveFile, ...) {
  557.     input <- read.csv(dataFile, head=T, row.names=1, stringsAsFactor=F)
  558.     selection <- list(...)
  559.     selError <- selection[!selection %in% rownames(input)]
  560.     # test selection
  561.     if (length(selError)>0) {
  562.         cat(paste(selError,' is not a valid rowname\n',sep='')) }
  563.     # output
  564.     else {output <- input[rownames(input) %in% selection,]
  565.           print(output)                                       # print to screen
  566.           write.csv(output, nameIsFree(saveFile), quote=F)    # save to file
  567.          }
  568. }
  569.  
  570. #-------------------------------------------------------------------------------
  571. # function aggr_pops()
  572. # In the files of Davidski rowlabel has the form 'pop:ID'
  573. # This function drops the part after the colon
  574. # and collects the mean of the pop before the colon.
  575. # Use for mean:   aggr_pops(fileName)
  576. # Use for median: aggr_pops(fileName, myFunc=median)
  577. #-------------------------------------------------------------------------------
  578. aggr_pops <- function(fileName, myFunc=mean) {
  579.     myData <- read.csv(fileName, head=T, row.names=1, stringsAsFactors=FALSE)
  580.     splitted <- headStrings(rownames(myData), mySep=':')
  581.     aggrData <- aggregate(myData, by=list(splitted), myFunc)
  582.     temp <- as.matrix(aggrData[,-1]); rownames(temp) <- aggrData[,1]
  583.     return(round(temp, 7))
  584. }
  585.  
  586. #-------------------------------------------------------------------------------
  587. # function tab2comma()
  588. # Convert tab-separated csv to comma-separated csv
  589. # Use: tab2comma(tabFile,commaFile)
  590. #-------------------------------------------------------------------------------
  591. tab2comma <- function(tabFile,commaFile) {
  592.     data <- read.csv(tabFile, head=T, sep='\t', row.names=1, stringsAsFactor=F)
  593.     nameIsFree(commaFile)
  594.     write.csv(data, commaFile, row.names=T)
  595.     }
  596.  
  597. #-------------------------------------------------------------------------------
  598. # function nearestItems()
  599. # Find n best matches.
  600. # Use: inData <- read file; inTarget <- read target
  601. #       nearestItems(inData, inTarget, maxFits=8)
  602. # This is not the nearest neighbor algorithm;
  603. # when the number of items is smaller than maxFits,
  604. # functions returns all the items.
  605. #-------------------------------------------------------------------------------
  606. nearestItems <- function(inData, inTarget, maxFits=8) {
  607.     totArr <- rbind(inTarget, inData)
  608.     distMat <- as.matrix(dist(totArr, method='euclidean'))
  609.     dist1 <- distMat[,1]
  610.     sortDist <- dist1[order(dist1)]
  611.     nFits <- min(nrow(inData), maxFits)
  612.     return(sortDist[2:(nFits+1)])    
  613.     }
  614.  
  615. #==================================internal stuff===============================
  616. # split head from vector of strings, out vector of heads
  617. headStrings <- function(strVec, mySep=':') {
  618.         unlist(lapply(strsplit(strVec, mySep), function(x) x[1]))
  619.         }
  620.  
  621. nameIsFree <- function(newFile) {
  622.     while (file.exists(newFile)) {
  623.         newFile <- readline('select new filename for saving (without quotes): ')
  624.         }
  625.     return(newFile)
  626.     }
  627.    
  628. check_formats <- function(sheet, target) {
  629.     if (any(is.na(sheet)))  {err_row <- as.integer(which(rowSums(is.na(sheet))>0))
  630.                             print(sheet[err_row, ])
  631.                             stop(paste('Missing value in row ',err_row))}
  632.     if (!identical(colnames(sheet), colnames(target)))
  633.         {print(colnames(sheet)); print(colnames(target))
  634.          stop('Colnames input not identical')}
  635.     }
  636.  
  637. check_omit <- function(sheet, dropInfo) {
  638.     if (dropInfo != '' & !dropInfo %in% rownames(sheet)) {
  639.         print('!!!! WARNING: Request to omit unknown popName. !!!!')
  640.         }
  641.     }
  642.  
  643. oracle <- function(Y,TABLE='table.txt',k=20,mode="single")
  644.  {
  645. if(mode=="single" | mode=="all")
  646. {
  647. single(Y,k)
  648. }
  649.  
  650. if(mode=="2way" | mode=="all")
  651. {
  652. twoWay(Y,k)
  653. }
  654.  
  655. if(mode=="fifty" | mode=="all")
  656. {
  657. fifty(Y,k)
  658. }
  659.  
  660. if(mode=="nmonte1" | mode=="all")
  661. {
  662. getMonte(Y)
  663. }
  664.  
  665. if(mode=="nmonte3" | mode=="all")
  666. {
  667. getMonte3(Y)
  668. }
  669.  
  670. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement