Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#' Estimate profile-HMM style parameters from a gapped sequence alignment
#'
#' Reads aligned sequences (one per line, gaps written as "-"), classifies
#' each alignment column as a match column (fewer than half of the rows hold
#' a gap) or an insert column, counts emissions and state transitions along
#' every sequence, and converts the counts to probabilities.
#'
#' Column `k` of every returned matrix refers to state `k - 1`; column 1 is
#' the begin state and the last column holds the transitions into the end
#' state (recorded in the `*-M` rows).
#'
#' @param file_name Path to the alignment file. It is read with `read.csv()`
#'   (no header, comma separator); each field is treated as one sequence.
#' @param ... Currently unused; accepted for backward compatibility.
#' @return An unnamed list of three numeric matrices:
#'   1. match emissions (symbols x L; column 1 is left at zero because the
#'      begin state emits nothing),
#'   2. insert emissions (symbols x L),
#'   3. state transitions (9 x L) with rows "M-M", "M-D", "M-I", "I-M",
#'      "I-D", "I-I", "D-M", "D-D", "D-I"; the "D-*" rows of column 1 are
#'      left at zero because there is no delete state 0.
#'   `L` is the number of match columns plus one.
create_model <- function(file_name, ...) {
  gap <- "-"

  # Turns a count vector into probabilities. Add-one smoothing is applied to
  # the whole vector only when at least one count is zero (same rule as the
  # original implementation).
  normalise <- function(counts) {
    if (any(counts == 0)) {
      counts <- counts + 1
    }
    counts / sum(counts)
  }

  # colClasses = "character" stops read.csv from converting sequences that
  # happen to look like logicals or numbers (e.g. a file of "T"/"F" lines).
  raw <- read.csv(
    file = file_name, header = FALSE, sep = ",",
    colClasses = "character"
  )
  sequences <- as.vector(as.matrix(raw))

  # One character vector per sequence; works for a single sequence as well.
  residues <- strsplit(sequences, "")
  widths <- lengths(residues)
  if (length(widths) == 0L || widths[1L] == 0L) {
    stop("alignment file contains no sequences", call. = FALSE)
  }
  if (any(widths != widths[1L])) {
    stop("all aligned sequences must have the same length", call. = FALSE)
  }
  gtp_data <- matrix(unlist(residues), ncol = widths[1L], byrow = TRUE)
  n_seq <- nrow(gtp_data)
  n_col <- ncol(gtp_data)

  # A column is a match column when fewer than half of its rows are gaps.
  is_match_col <- colSums(gtp_data == gap) < n_seq * 0.5
  L <- sum(is_match_col) + 1

  # Alphabet = every symbol seen in the alignment except the gap. The gap is
  # removed by value, so gap-free alignments keep all their symbols.
  symbols_list <- setdiff(sort(unique(as.vector(gtp_data))), gap)

  match_emissions <- matrix(
    0, nrow = length(symbols_list), ncol = L,
    dimnames = list(symbols_list, NULL)
  )
  insert_emissions <- match_emissions
  transition_names <- c(
    "M-M", "M-D", "M-I", "I-M", "I-D", "I-I", "D-M", "D-D", "D-I"
  )
  state_transitions <- matrix(
    0, nrow = 9, ncol = L,
    dimnames = list(transition_names, NULL)
  )

  for (i in seq_len(n_seq)) {
    prev <- "M"        # the begin state is treated like a match state
    match_number <- 0  # index of the last match/delete state visited
    for (j in seq_len(n_col)) {
      symbol <- gtp_data[i, j]
      is_gap <- symbol == gap
      from_col <- match_number + 1
      if (is_match_col[j]) {
        # A gap in a match column is a deletion, anything else a match.
        current <- if (is_gap) "D" else "M"
        transition <- paste0(prev, "-", current)
        state_transitions[transition, from_col] <-
          state_transitions[transition, from_col] + 1
        if (!is_gap) {
          # Emission belongs to the state being entered (match_number + 1),
          # which is stored one column further to the right.
          match_emissions[symbol, from_col + 1] <-
            match_emissions[symbol, from_col + 1] + 1
        }
        prev <- current
        match_number <- match_number + 1
      } else if (!is_gap) {
        # A residue in an insert column is emitted by the insert state that
        # follows the last visited match/delete state.
        transition <- paste0(prev, "-I")
        state_transitions[transition, from_col] <-
          state_transitions[transition, from_col] + 1
        insert_emissions[symbol, from_col] <-
          insert_emissions[symbol, from_col] + 1
        prev <- "I"
      }
      # Gaps in insert columns carry no information and are skipped.
    }
    # Transition into the end state, counted once per sequence in the "*-M"
    # row of the state the sequence finished in.
    transition <- paste0(prev, "-M")
    state_transitions[transition, L] <- state_transitions[transition, L] + 1
  }

  # Column 1 of the match emissions is the begin state and is not normalised.
  for (k in seq_len(L)[-1]) {
    match_emissions[, k] <- normalise(match_emissions[, k])
  }
  for (k in seq_len(L)) {
    insert_emissions[, k] <- normalise(insert_emissions[, k])
    state_transitions[1:3, k] <- normalise(state_transitions[1:3, k])
    state_transitions[4:6, k] <- normalise(state_transitions[4:6, k])
  }
  # There is no delete state 0, so the "D-*" rows start at column 2.
  for (k in seq_len(L)[-1]) {
    state_transitions[7:9, k] <- normalise(state_transitions[7:9, k])
  }

  list(match_emissions, insert_emissions, state_transitions)
}
# Build the two models used in the report below.
# NOTE(review): M1 and M2 are built from the same input file, so they will be
# identical; confirm whether M2 was meant to use a different data set.
M1 <- create_model("/Users/Pawel/Desktop/test_data.txt")
M2 <- create_model("/Users/Pawel/Desktop/test_data.txt")

# 4. Report the length of both M1 and M2 models (1 point)
# The emission matrices have one extra leading column, hence the "- 1".
# M1 length
ncol(M1[[1]]) - 1
# M2 length
ncol(M2[[1]]) - 1

# 5. Report the dimensions of your T, mE and iE for both models (1 point).
# List order returned by create_model: [[1]] match emissions,
# [[2]] insert emissions, [[3]] state transitions.
# M1 dimensions
dim(M1[[1]])
dim(M1[[2]])
dim(M1[[3]])
# M2 dimensions
dim(M2[[1]])
dim(M2[[2]])
dim(M2[[3]])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement