Advertisement
Guest User

GenerateWindows

a guest
Jun 10th, 2022
153
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Awk 6.78 KB | None | 0 0
  1. #!/usr/bin/awk -f
  2.  
  3. #This program takes a fasta and generates windows of a specified size.
  4. #Usage: GenerateWindows.awk -v WinSize=XXX WinSlide=XXX AmbProp=XXX TestData_GenerateWindows.fasta
  5. #Written by: Emil Karpinski (2022-06-09)
  6.  
  7. BEGIN {
  8.     #Sets the default print seperator in awk to be a tab
  9.     OFS = "\t";    
  10.     #Initializes an empty string to store the Name name.
  11.     Name = "";
  12.     #Intializes an empty string to store the Sequence.
  13.     Seq = "";
  14.  
  15.     ##Uses the split function which splits the string "A,C,T,G,N" into the nucleotides array based on the "," seperator.
  16.     ###I'm not sure what the purpose of N_nuc here is but it contains the value 5? Maybe this corresponds to the number of nucleotides in the analysis such that this could be extended.
  17.     N_nuc = split("A,C,T,G,N",nucleotides,",");
  18.     #Setting a flag to print the header the first time around.
  19.     Flag = 0;
  20. }
  21.  
  22. #This function counts each nucleotide and returns a tab delimited
  23. #string of counts (including 0s in the order A,C,T,G,N)
  24. ##Initializes the CountWindow function and catches the passed variable in a new one called window (typically contains the contents of the Seq variable I think)
  25. function CountWindow(window){
  26.  
  27.     ##This for loop loops from 1 to 5, acting as an index for the nucleotides array and setting the count of each nucleotide to 0. Necessary in case one of the characers isn't present
  28.     ##This count it stored in the Count array which has a BP index (i.e. Count["A"]; Count["C"])
  29.     for(nuc=1;nuc<=N_nuc;nuc++){
  30.         Count[nucleotides[nuc]] = 0;
  31.     }
  32.     ##As above splits the string stored in the window variable into the array chars, based on nothing (so should be every characters).
  33.     ##Also generates a variable n which stores the length of the chars array (i.e. the number of characters).
  34.     n=split(window,chars,"");
  35.     ##This loop loops through every character in the new chars array, and for each one does two things:
  36.     ##First it converts it to upper case (in case the fasta has lower case bases in it.
  37.     ##Second it updates the corresponding entry in the Count array if the base is a A, C, T, or G, else it updates the count of the N entry (a general catch all for N's and all other ambigious bases).
  38.     for(c=1;c<=n;c++){
  39.         char = toupper(chars[c]);
  40.         if(char ~ /[ACTG]/){
  41.             Count[char]++;
  42.         } else {Count["N"]++}
  43.     }
  44.     ##Creates a new variable called str (which is used below in creating the output), and sets it equal to the number of A's in the window.
  45.     ##This appears to be just a shortcut. You could probably do this in the loop as well, but it means more code.
  46.     str = Count["A"];
  47.     ##Loops from 2 to 5 (the value of N_nuc), and appends the value of the Count[] element consistent with the current base ot the value of str.
  48.     ###I'm pretty sure you could make str an empty string above and loop through 5 times as well.
  49.     for(nuc=2;nuc<=N_nuc;nuc++){
  50.         str=str"\t"Count[nucleotides[nuc]];
  51.     }
  52.  
  53.     ##This returns the value of string to the line that called it.
  54.     return str;
  55. }
  56. ##This is a really truncated if-statment which basically says if you encounter a ">" at the start of line "^" do the code in the block.
  57. ###Functionally I think this means that
  58. /^>/{
  59.     if(Flag == 0){
  60.         ##Generates a substring from the line with everything from the first character
  61.         Name=substr($0,2);
  62.         Output = Name "" ".txt";
  63.         FilteredOutput = Name "" "_AmbFiltered.txt";
  64.        
  65.         #Print the headers
  66.         print("Name","WinStart","WinEnd","A","C","T","G","N") > Output;
  67.         print("Name","WinStart","WinEnd","A","C","T","G","N") > FilteredOutput;
  68.         Flag = 1;
  69.     }
  70.     #Each time a header is encountered, finish processing the last Name and reset
  71.     ##This is an if statment which checks if the length of Seq is equal to zero, and if it's not then run the code.
  72.  
  73.     ##Generates a substring from the line with everything from the first character
  74.     Name=substr($0,2);
  75.     Output = Name "" ".txt";
  76.     FilteredOutput = Name "" "_AmbFiltered.txt";
  77.  
  78.     ##Sets the window start position to 1q
  79.     winpos=1;
  80.     ##Immediately stops processing the current line and goes to the next one.
  81.     ###I suspect this is just a time saver, and is unnecessary. Just stops the program from evaulating the next block of code
  82.     next;
  83. }
  84. #Each line of Sequence is appended to the current known Sequence
  85. {Seq = Seq $0 }
  86. {
  87.     #Once at least a windows worth of Sequence is loaded, count the window
  88.     #Then strip the window off the front of the loaded Sequence
  89.     while(length(Seq) >= WinSize){
  90.         ##Generates the substring of the window from the Seq starting at the first position and equal to the window size
  91.         window=substr(Seq,1,WinSize);
  92.         ##Calculates the final base in the window
  93.         winend = winpos + WinSize - 1;
  94.  
  95.         ##Prints the name of the Name, the start and end position of the window and then runs the CountWindow function (above) passing it the window substring. This returns the string containing the number of A, T, C, and Gs seperated by tabs already.
  96.         print(Name, winpos, winend, CountWindow(window)) >> Output;
  97.                 #Also printing the window if it is less than or equal to the ambiguity filter.
  98.         if(Count["N"]/WinSize <= AmbProp){
  99.             print(Name, winpos, winend, str) >> FilteredOutput;
  100.         }
  101.         ##Updates the winpos integer by adding the WinSize produces a new integer corresponding to the rightmost position outside the window.
  102.         winpos+=WinSlide;
  103.         ##Produces a new substrating from the Sequence starting at the rightmost base outside the window.
  104.         ##This uses WinSize+1 as opposed to winpos since it effectively truncates the Sequence by removing the old window. So winpos wouldn't work.
  105.         Seq = substr(Seq,WinSlide+1);
  106.     }
  107.  
  108. }
  109. END { #Finish processing any remaining Sequence
  110.     ##As above checks if there's any Sequence length (i.e. length != 0) and
  111.     if(length(Seq)>WinSize){
  112.         ##Calculates the final position in the Sequence, except here will create a window from the last tested position to the end of the Sequence (i.e. a smaller window comprising only the remainder of the Seq).
  113.         winend = winpos + length(Seq) - 1;
  114.  
  115.         ##Prints the name of the Name, the start and end position of the window and then runs the CountWindow function (above) passing it the final, partial, substring. This returns the string containing the number of A, T, C, and Gs seperated by tabs already.
  116.         print(Name, winpos, winend, CountWindow(Seq)) >> Output;
  117.        
  118.         #Also printing the window if it is less than or equal to the ambiguity filter.
  119.         if(Count["N"]/WinSize <= AmbProp){
  120.             print(Name, winpos, winend, str) >> FilteredOutput;
  121.         }
  122.     }
  123. }
  124.  
  125.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement