Guest User

Untitled

a guest
Nov 25th, 2017
113
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.07 KB | None | 0 0
  1. #
  2. # Convert a Fasta file to a Genepop file
  3. # Author: Tom Jenkins
  4. #
  5.  
  6. # Install packages if required and load libraries
  7. if(!require(pegas)){install.packages("pegas"); library(pegas)}
  8. if(!require(seqinr)){install.packages("seqinr"); library(seqinr)}
  9. if(!require(stringi)){install.packages("stringi"); library(stringi)}
  10. if(!require(miscTools)){install.packages("miscTools"); library(miscTools)}
  11.  
  12.  
  13. # Import Fasta using read.fasta from the seqinr package
  14. fasta = read.fasta(file="filename.fasta", set.attributes=FALSE, as.string=TRUE)
  15. fasta
  16.  
  17.  
  18. # List of populations using the first 3 characters of the Fasta file ID
  19. poplist = unique(substr(names(fasta), 1, 3))
  20. poplist
  21.  
  22.  
  23. # Create a dataframe to store information for each individual
  24. pop_df = as.data.frame(substr(names(fasta), 1, 3))
  25. colnames(pop_df) = "Indiv"
  26. pop_df = cbind(pop_df, Haplotype=rep(NA, nrow(pop_df)))
  27.  
  28.  
  29. # Import fasta file as class DNAbin
  30. data = read.dna("filename.fasta", format="fasta")
  31. class(data)
  32. data
  33.  
  34.  
  35. # Calculate haplotype frequecies and show the output
  36. (h = haplotype(data))
  37. class(h)
  38. plot(h)
  39.  
  40.  
  41. # Assign haplotypes to the samples automatically using a loop
  42. for (i in 1:length(labels(h)))
  43. {
  44. pop_df$Haplotype[attr(h, "index")[[i]]] = i
  45. }
  46.  
  47.  
  48. # Convert all haplotype numbers into Genpop alleles (e.g. 1 = 001001)
  49. for (i in 1:nrow(pop_df))
  50. {
  51. # sprintf formats the number in DataInds$HAP[i] to have 3 integer digits
  52. # stri_dup repeats the text twice
  53. pop_df$Genpop[i] = stri_dup(sprintf('%02d', pop_df$Haplotype[i]), 2)
  54. }
  55. head(pop_df)
  56.  
  57.  
  58. # Create a dataframe containing information required for Genpop file
  59. genpop_df = data.frame(pop_df$Indiv, Sep=",", pop_df$Genpop)
  60. colnames(genpop_df) = c("Pop","Sep","Genotype")
  61. genpop_df = genpop_df[order(genpop_df$Pop),] # must be in alphabetic order
  62.  
  63. # Convert to a matrix
  64. genpop_mat = as.matrix(genpop_df)
  65.  
  66.  
  67. # Count the number of individuals in each population
  68. pop_counts = tapply(1:nrow(pop_df), pop_df$Indiv, function(x) length(unique(x)))
  69. pop_counts = as.data.frame(pop_counts)
  70.  
  71. # Add a column totalling the cumulative sum
  72. pop_counts$Sum = cumsum(apply(pop_counts$pop_counts, 1, sum))
  73. pop_counts
  74.  
  75.  
  76. # Create a variable containing the title, locus and pop rows
  77. title.line = c("Example Genpop File", "", "")
  78. loci.line = c("locus1", "", "")
  79. pop.line = c("Pop", "", "")
  80.  
  81.  
  82. # Add "Pop" between each population using a loop
  83. for (i in 1:nrow(pop_counts)){
  84.  
  85. # i is the row number and increases by 1 after each interation to compensate
  86. # for the extra row being inserted each run through the loop
  87. pop.row = rep(NA, nrow(pop_counts))
  88. pop.row[i] = pop_counts$Sum[i] + i
  89. genpop_mat = insertRow(genpop_mat, pop.row[i], pop.line)
  90. }
  91. genpop_mat
  92.  
  93. # Remove the last "Pop" row
  94. genpop_mat = genpop_mat[-nrow(genpop_mat),]
  95. genpop_mat
  96.  
  97.  
  98. # Insert title, locus and pop rows at the beginning
  99. genpop_mat = insertRow(genpop_mat, 1, title.line)
  100. genpop_mat = insertRow(genpop_mat, 2, loci.line)
  101. genpop_mat = insertRow(genpop_mat, 3, pop.line)
  102. head(genpop_mat, 10)
  103.  
  104. # Export file
  105. write.table(genpop_mat, file="filename.gen", quote=FALSE, col.names=F, row.names=F)
  106.  
  107.  
  108. # Test
  109. genind_test = import2genind("filename.gen")
  110. genind_test
  111. summary(genind_test)
Add Comment
Please, Sign In to add comment