Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Read every file in one directory and parse each line into exactly 5 columns.
#
# The files are CSV-like and ideally have exactly 5 columns, but the 5th
# column is free text and may itself contain an arbitrary number of commas.
# Reading with sep = "," therefore produces rows of arbitrary width, and
# keeping only the first 5 fields would truncate the 5th column. Instead,
# every line is read whole and then split on its first 4 commas only.
#
# Uses `dirs` and `j`, which must already be defined by the surrounding
# script, plus fread()/rbindlist() (data.table) and str_*() (stringr).
# Result: `data`, a data.frame with 5 character columns (X1..X5).

# Full paths of all files within the current directory.
files <- list.files(path = dirs[j], full.names = TRUE)

# Use the newline as the only separator so each line arrives as one field.
raw <- rbindlist(lapply(files, fread, sep = "\n", fill = TRUE, header = FALSE))

# An empty directory yields a data.table with no columns; treat it as having
# no lines rather than failing on the column lookup.
lines <- if (ncol(raw) > 0L) as.character(raw[[1L]]) else character(0)

# Keep only lines matching the pattern, which guarantees at least 4 commas
# and a non-empty 5th field. Empty lines cannot match, so no separate
# empty-row filter is needed; NA lines are dropped explicitly.
has_fifth <- str_detect(trimws(lines, which = "both"), ".+,.+,.*,.*,.+")
retain <- !is.na(has_fifth) & has_fifth
lines <- lines[retain]

# Split on the first 4 commas only: with n = 5 the last piece keeps the rest
# of the line, commas included. Splitting directly avoids swapping in a
# temporary delimiter that could also occur inside the data.
pieces <- str_split_fixed(lines, ",", n = 5L)

# matrix(..., ncol = 5L) leaves an n x 5 result unchanged and pins the shape
# to 0 x 5 when no lines were retained.
data <- data.frame(matrix(pieces, ncol = 5L), stringsAsFactors = FALSE)
Add Comment
Please, Sign In to add comment