# Untitled

#
# Convert a Fasta file to a Genepop file
# Author: Tom Jenkins
#

# Install packages if required and load libraries
if(!require(pegas)){install.packages("pegas"); library(pegas)}
if(!require(seqinr)){install.packages("seqinr"); library(seqinr)}
if(!require(stringi)){install.packages("stringi"); library(stringi)}
if(!require(miscTools)){install.packages("miscTools"); library(miscTools)}
if(!require(adegenet)){install.packages("adegenet"); library(adegenet)}


# Read the FASTA file with read.fasta (seqinr); as.string = TRUE and
# set.attributes = FALSE are passed so each sequence comes back as one plain
# string
fasta <- read.fasta(
  file = "filename.fasta",
  set.attributes = FALSE,
  as.string = TRUE
)
fasta


# Population codes: the first 3 characters of every FASTA sequence ID,
# reduced to one entry per distinct code
poplist <- unique(substr(names(fasta), start = 1, stop = 3))
poplist


# Per-sequence table: `Indiv` holds the 3-character prefix of each sequence ID
# and `Haplotype` starts out as NA until haplotype numbers are filled in
pop_df <- data.frame(Indiv = substr(names(fasta), start = 1, stop = 3))
pop_df$Haplotype <- rep(NA, nrow(pop_df))


# Read the same FASTA file again with read.dna so the sequences are available
# as a DNAbin object
data <- read.dna(file = "filename.fasta", format = "fasta")
class(data)
data


# Calculate haplotype frequencies, then show the result, its class and the
# default plot
h <- haplotype(data)
h
class(h)
plot(h)


# Assign a haplotype number to every sample.
# attr(h, "index") holds, for each haplotype, the positions of the sequences
# that carry it. Those positions are used directly as row numbers of pop_df,
# which relies on pop_df having one row per sequence in the same order as the
# sequences the haplotypes were computed from.
hap_index <- attr(h, "index")
pop_df$Haplotype[unlist(hap_index)] <- rep(
  seq_along(hap_index),
  lengths(hap_index)
)


# Convert haplotype numbers into Genepop genotype codes: each number is
# zero-padded to a fixed width and written twice (e.g. 1 -> "0101" with
# 2-digit alleles). Every allele in a Genepop file must use the same number
# of digits (2 or 3), so 3 digits are used as soon as there are more than 99
# haplotypes instead of producing a mix of widths. Files written with 3-digit
# alleles have to be read back with ncode = 3.
n_haplotypes <- max(pop_df$Haplotype, 0L, na.rm = TRUE)
if (n_haplotypes > 999) {
  stop(
    "More than 999 haplotypes cannot be coded as Genepop alleles",
    call. = FALSE
  )
}
allele_width <- if (n_haplotypes > 99) 3L else 2L

# sprintf zero-pads all haplotype numbers in one vectorised call;
# stri_dup repeats each padded code twice
pop_df$Genpop <- stri_dup(
  sprintf(paste0("%0", allele_width, "d"), pop_df$Haplotype),
  2
)
head(pop_df)


# Assemble the three columns written for every individual: the population
# code, a comma separator and the genotype code
genpop_df <- data.frame(
  Pop = pop_df$Indiv,
  Sep = ",",
  Genotype = pop_df$Genpop
)

# Rows must be in alphabetic order of population
genpop_df <- genpop_df[order(genpop_df[["Pop"]]), ]

# Convert the sorted data frame to a matrix
genpop_mat <- as.matrix(genpop_df)


# Count the number of individuals (rows of pop_df) in each population
pop_counts <- tapply(seq_len(nrow(pop_df)), pop_df$Indiv, length)

# Store the counts in a data frame with one row per population; the column
# name is set explicitly rather than derived from the variable name
pop_counts <- data.frame(
  pop_counts = as.vector(pop_counts),
  row.names = names(pop_counts)
)

# Add a column holding the running total of individuals.
# cumsum is applied to the column directly: apply() needs an object with
# dimensions and stops when it is given a plain vector.
pop_counts$Sum <- cumsum(pop_counts$pop_counts)
pop_counts


# Title, locus and "Pop" rows; each is padded with two empty strings so it
# has three elements
title.line <- c("Example Genpop File", rep("", 2))
loci.line <- c("locus1", rep("", 2))
pop.line <- c("Pop", rep("", 2))


# Insert a "Pop" row between consecutive populations.
# pop_counts$Sum[i] is the number of individuals in the first i populations,
# so before any insertion the separator after population i belongs at row
# Sum[i] + 1; the i - 1 separators already inserted shift that to Sum[i] + i.
# No separator is added after the last population, so there is no trailing
# "Pop" row to remove afterwards.
n_separators <- max(nrow(pop_counts) - 1L, 0L)
for (i in seq_len(n_separators)) {
  genpop_mat <- insertRow(genpop_mat, pop_counts$Sum[i] + i, pop.line)
}
genpop_mat


# Put the title, locus and first "Pop" rows on top of the genotype rows.
# deparse.level = 0 stops rbind from using the variable names as row labels.
genpop_mat <- rbind(
  title.line, loci.line, pop.line, genpop_mat,
  deparse.level = 0
)
head(genpop_mat, 10)

# Export file: one space-separated line per matrix row, without quotes,
# column names or row names
write.table(
  genpop_mat,
  file = "filename.gen",
  quote = FALSE,
  col.names = FALSE,
  row.names = FALSE
)


# Test: read the exported file back in as a genind object.
# import2genind is provided by the adegenet package, which must be loaded.
# ncode is the number of digits per allele; each genotype code holds two
# alleles, so it is half the width of the genotype codes.
allele_digits <- as.integer(max(nchar(pop_df$Genpop)) / 2)
genind_test <- import2genind("filename.gen", ncode = allele_digits)
genind_test
summary(genind_test)