Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /* al2co.c
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * Department of Biochemistry
- * University of Texas Southwestern Medical Center at Dallas
- *
- * This software is freely available to the public for use.We have not placed
- * any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the University of Texas
- * Southwestern Medical Center does not and cannot warrant the performance
- * or results that may be obtained by using this software or data. The
- * University of Texas Southwestern Medical Center disclaims all warranties,
- * express or implied, including warranties of performance, merchantability
- * or fitness for any particular purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * File Name: al2co.c
- *
- * Author: Jimin Pei, Nick Grishin
- * Version Creation Date: 12/23/2000
- *
- * File Description:
- * A program to calculate positional conservation in a sequence alignment
- *
- * Last Updated: 06/03/2001 (add -e option)
- * Last Updated: 08/13/2002 (MAXSEQNUM)
- version 1.2
- Last updated: 05/06/03
- change 04.28.2011
- *
- * --------------------------------------------------------------------------*/
- #include <stdio.h>
- #include <stdlib.h>
- #include <math.h>
- #include <ctype.h>
- #include <string.h>
- //#include <malloc.h>
- #include <stddef.h>
- #define SQUARE(a) ((a)*(a))
- #define NUM_METHOD 9
- #define MAX_WINDOW 20
- #define MAX_DELTASITE 20
- #define MAXSTR 10001
- #define INDI -100
- #define MAXSEQNUM 50000
- char *digit="0123456789";
- void nrerror(char error_text[]);
- char *cvector(long nl, long nh);
- int *ivector(long nl, long nh);
- double *dvector(long nl, long nh);
- int **imatrix(long nrl, long nrh, long ncl, long nch);
- double **dmatrix(long nrl, long nrh, long ncl, long nch);
- double ***d3tensor(long nrl,long nrh,long ncl,long nch,long ndl,long ndh);
- int a3let2num(char *let);
- int am2num_c(int c);
- int am2num(int c);
- int am2numBZX(int c);
- static void *mymalloc(int size);
- char *strsave(char *str);
- char *strnsave(char *str, int l);
- static char **incbuf(int n, char **was);
- static int *incibuf(int n, int *was);
- void err_readali(int err_num);
- void readali(char *filename);
- static void printali(char *argt, int chunk, int n, int len, char **name, char **seq, int *start,int *csv);
- int **ali_char2int(char **aseq,int start_num, int start_seq);
- int **read_alignment2int(char *filename,int start_num,int start_seq);
- void counter(int b);
- double effective_number(int **ali, int *marks, int n, int start, int end);
- double effective_number_nogaps(int **ali, int *marks, int n, int start, int end);
- double effective_number_nogaps_expos(int **ali, int *marks, int n, int start, int end, int pos);
- void **freq(int **ali,double **f,int *num_gaps,int *effindiarr,double gap_threshold);
- double *overall_freq(int **ali, int startp, int endp, int *mark);
- double *overall_freq_wgt(int **ali,int startp,int endp,int *mark,double *wgt);
- double *h_weight(int **ali, int ip);
- double **h_freq(int **ali, double **f, double **hfr);
- double *entro_conv(double **f, int **ali, double *econv);
- double **ic_freq(int **ali, double **f, double **icf);
- double *variance_conv(double **f, int **ali, double **oaf, double *vconv);
- double *pairs_conv(double **f,int **ali,int **matrix1,int indx,double *pconv);
- typedef struct _conv_info{
- double **fq, **hfq, **icfq;
- char *alifilename;
- int alignlen;
- int nali;
- int *ngap;
- int gapless50;
- double eff_num_seq;
- double *over_all_frq;
- int *eff_indi_arr;
- double *avc,*csi;
- double ***conv;
- } conv_info;
- char **aname, **aseq;
- int nal, alilen, *astart, *alen;
- int **alignment;
- double **u_oaf,**h_oaf;
- char *am="-WFYMLIVACGPTSNQDEHRKBZX*.wfymlivacgptsnqdehrkbzx";
- char *am3[]={
- "---",
- "TRP",
- "PHE",
- "TYR",
- "MET",
- "LEU",
- "ILE",
- "VAL",
- "ALA",
- "CYS",
- "GLY",
- "PRO",
- "THR",
- "SER",
- "ASN",
- "GLN",
- "ASP",
- "GLU",
- "HIS",
- "ARG",
- "LYS",
- "ASX",
- "GLX",
- "UNK",
- "***"
- "...",
- };
- double *nmlconv(double *conv, conv_info cvf, int wn,int mi);
- int **read_aa_imatrix(FILE *fmat);
- int **identity_imat(long n);
- void argument();
- void print_parameters(FILE *outfile,char *argi,char *argo,int nt,char *argt,int argb,char *args,int argm,int argf,int argc, int argw, char *argn,char *arga, char *arge, double argg, char *argp,char *argd);
- main(int argc, char *argv[])
- {
- FILE *fout, *fpdb,*matrixfile,*fpdbout,*fp,*ft;
- conv_info convinfo;
- char fstr[MAXSTR+1];
- int i,j,k,l,nt=0;
- double sumofconv,heteroatom;
- int *ptoc1;
- char *ptoc,tmpstr[200],*convstr;
- int fcount=0, fi=0;
- int **smatrix;
- double *consv;
- double **consvall;
- int markali[MAXSEQNUM];
- char ARG_I[500],ARG_O[500],ARG_T[500],ARG_P[500],ARG_D[500],ARG_S[500],ARG_N[500],ARG_A[500],ARG_E[500];
- ARG_I[0]='\0';ARG_O[0]='\0';ARG_T[0]='\0';ARG_P[0]='\0';ARG_D[0]='\0';ARG_S[0]='\0';ARG_N[0]='\0';ARG_A[0]='\0';ARG_E[0]='\0';
- int ARG_F=2,ARG_C=0,ARG_W=1,ARG_M=0,ARG_B=60;
- double ARG_G=0.5;
- double max_index, mini_index;
- int *csv_index;
- int NAL;
- int **ALIGNMENT;
- /*read input arguments */
- if(argc<=2) { argument(); exit(0);}
- for(i=1;i<argc;i++) {
- if(strcmp(argv[i],"-i")==0) {strcpy(ARG_I,argv[i+1]);i++;continue;}
- if(strcmp(argv[i],"-o")==0) {strcpy(ARG_O,argv[i+1]);i++;continue;}
- if(strcmp(argv[i],"-t")==0) {strcpy(ARG_T,argv[i+1]);i++;continue;}
- if(strcmp(argv[i],"-p")==0) {strcpy(ARG_P,argv[i+1]);i++;continue;}
- if(strcmp(argv[i],"-d")==0) {strcpy(ARG_D,argv[i+1]);i++;continue;}
- if(strcmp(argv[i],"-s")==0) {strcpy(ARG_S,argv[i+1]);i++;continue;}
- if(strcmp(argv[i],"-n")==0) {strcpy(ARG_N,argv[i+1]);i++;continue;}
- if(strcmp(argv[i],"-a")==0) {strcpy(ARG_A,argv[i+1]);i++;continue;}
- if(strcmp(argv[i],"-e")==0) {strcpy(ARG_E,argv[i+1]);i++;continue;}
- if(strcmp(argv[i],"-f")==0) {sscanf(argv[i+1],"%d",&ARG_F);i++;continue;}
- if(strcmp(argv[i],"-b")==0) {sscanf(argv[i+1],"%d",&ARG_B);i++;continue;}
- if(strcmp(argv[i],"-c")==0) {sscanf(argv[i+1],"%d",&ARG_C);i++;continue;}
- if(strcmp(argv[i],"-w")==0) {sscanf(argv[i+1],"%d",&ARG_W);i++;continue;}
- if(strcmp(argv[i],"-m")==0) {sscanf(argv[i+1],"%d",&ARG_M);i++;continue;}
- if(strcmp(argv[i],"-g")==0) {sscanf(argv[i+1],"%lf",&ARG_G);i++;continue;}
- }
- if((ARG_F>2)||(ARG_F<0)){fprintf(stderr,"frequency calculation method(-f): \n0, unweighted; 1, Henikoff weight; 2, independent count\n");
- exit(0);}
- if((ARG_C>2)||(ARG_C<0)){fprintf(stderr,"conservation calculation strategy(-c):\n0,entropy;1,variance;2,sumofpairs\n");
- exit(0);}
- if((ARG_M>2)||(ARG_M<0)){fprintf(stderr,"matrix transform(-m):\n0, no transform;1,normalization;2,adjustment\n");
- exit(0);}
- if((ARG_G>1.0)||(ARG_G<=0)){fprintf(stderr,"gap percentage(-g) to suppress calculation must be no more than 1 and more than 0 \n");
- exit(0);}
- /* smatrix: identity matrix if reading from input failed */
- if((matrixfile=fopen(ARG_S,"r"))==NULL){
- if(strlen(ARG_S)!=0)fprintf(stderr, "Warning: not readable matrixfile: %s; default: identity matrix\n",ARG_S);
- smatrix = identity_imat(24); }
- else {smatrix=read_aa_imatrix(matrixfile);
- /*for(i=1;i<=20;i++){
- for(j=1;j<=20;j++){
- fprintf(stderr, "%d ",smatrix[i][j]);
- }
- fprintf(stderr,"\n");
- }*/
- fclose(matrixfile); }
- /* read alignment */
- ALIGNMENT = read_alignment2int(ARG_I,1,1);
- if( (strcmp(ARG_E, "T") == 0) || (strcmp(ARG_E, "t")==0) ) {
- alignment = read_alignment2int(ARG_I,1,1);
- alignment = alignment + 1;
- NAL = nal;
- nal = nal -1;
- }
- else {
- alignment=read_alignment2int(ARG_I,1,1);
- NAL = nal;
- }
- if(nal>=MAXSEQNUM) {
- fprintf(stderr, "Error: Number of sequences exceeds %d\n", MAXSEQNUM);
- exit(0);
- }
- if(alignment==NULL){
- fprintf(stderr, "alignment file not readable\n");
- }
- /* memory allocation for the elements in convinfo*/
- convinfo.alignlen=alilen;
- convinfo.nali=nal;
- convinfo.ngap=ivector(0,alilen);
- convinfo.eff_indi_arr=ivector(0,alilen+1);
- convinfo.alifilename=cvector(0,strlen(ARG_I));
- strcpy(convinfo.alifilename,ARG_I);
- convinfo.over_all_frq=dvector(0,20);
- convinfo.fq=dmatrix(0,20,0,alilen);
- convinfo.hfq=dmatrix(0,20,0,alilen);
- convinfo.icfq=dmatrix(0,20,0,alilen);
- convinfo.conv=d3tensor(0,MAX_WINDOW,0,NUM_METHOD,0,alilen);
- convinfo.avc=dvector(0,NUM_METHOD);
- convinfo.csi=dvector(0,NUM_METHOD);
- consv=dvector(0,alilen);
- consvall=dmatrix(0,NUM_METHOD,0,alilen);/* for all 9 methods */
- csv_index=ivector(0,alilen);
- for(i=1;i<=MAX_WINDOW;i++)
- for(j=1;j<=NUM_METHOD;j++)
- for(k=0;k<=alilen;k++)
- convinfo.conv[i][j][k]=INDI;
- /* get the frequencies */
- freq(alignment,convinfo.fq,convinfo.ngap,convinfo.eff_indi_arr,ARG_G);
- h_freq(alignment,convinfo.fq,convinfo.hfq);
- ic_freq(alignment,convinfo.fq,convinfo.icfq);
- convinfo.gapless50=0;
- for(i=1;i<=alilen;i++){
- if(convinfo.ngap[i]<=0.5*convinfo.nali)
- convinfo.gapless50++;
- }
- for(i=1;i<=nal;i++){markali[i]=1;}
- convinfo.over_all_frq=overall_freq(alignment,1,alilen,markali);
- /* get the conservation indeces for method[1..NUM_METHOD] */
- entro_conv(convinfo.fq,alignment,convinfo.conv[1][1]);
- entro_conv(convinfo.hfq,alignment,convinfo.conv[1][2]);
- entro_conv(convinfo.icfq,alignment,convinfo.conv[1][3]);
- variance_conv(convinfo.fq,alignment,u_oaf,convinfo.conv[1][4]);
- variance_conv(convinfo.hfq,alignment,h_oaf,convinfo.conv[1][5]);
- variance_conv(convinfo.icfq,alignment,u_oaf,convinfo.conv[1][6]);
- pairs_conv(convinfo.fq,alignment,smatrix,ARG_M,convinfo.conv[1][7]);
- pairs_conv(convinfo.hfq,alignment,smatrix,ARG_M,convinfo.conv[1][8]);
- pairs_conv(convinfo.icfq,alignment,smatrix,ARG_M,convinfo.conv[1][9]);
- /* average over window size ARG_W */
- for(i=1;i<=9;i++){
- sumofconv=0;
- k=0;
- for(j=1;j<=alilen;j++){
- if(convinfo.conv[1][i][j]==INDI)continue;
- sumofconv+=convinfo.conv[1][i][j];
- k++;
- }
- convinfo.avc[i]=sumofconv/k;
- }
- if(ARG_W>1){
- ptoc1=convinfo.eff_indi_arr;
- for(i=1;i<=NUM_METHOD;i++){
- j=ARG_W;
- if(j>*ptoc1){fprintf(stderr,"window size too big!\nARG_W:%d; ptoc1:%d\n",j,*ptoc1);exit(0);}
- for(k=(j+1)/2;k<=convinfo.eff_indi_arr[0]-j/2;k++){
- sumofconv=0;
- for(l=k-(j-1)/2;l<=k+j/2;l++){
- sumofconv+=convinfo.conv[1][i][*(ptoc1+l)];
- }
- convinfo.conv[j][i][*(ptoc1+k)]=sumofconv/j;
- }
- for(k=1;k<(j+1)/2;k++){
- sumofconv=0;
- for(l=1;l<=2*k-1;l++){
- sumofconv+=convinfo.conv[1][i][*(ptoc1+l)];
- }
- convinfo.conv[j][i][*(ptoc1+k)]=convinfo.avc[i]+(sumofconv/(2*k-1)-convinfo.avc[i])*sqrt(1.0*(2*k-1)/ARG_W);
- }
- for(k=0;k<=j/2-1;k++){
- sumofconv=0;
- for(l=1;l<=2*k+1;l++){
- sumofconv+=convinfo.conv[1][i][*(ptoc1+convinfo.eff_indi_arr[0]-l+1)];
- }
- convinfo.conv[j][i][*(ptoc1+convinfo.eff_indi_arr[0]-k)]=convinfo.avc[i]+(sumofconv/(2*k+1)-convinfo.avc[i])*sqrt(1.0*(2*k+1)/ARG_W);
- }
- }
- }
- /* normalize the conservation indices */
- nmlconv(consv, convinfo, ARG_W, ARG_C*3+ARG_F+1);
- if((strcmp(ARG_N,"F")==0)||(strcmp(ARG_N,"f")==0)){
- for(i=1;i<=convinfo.alignlen;i++){
- if(convinfo.conv[ARG_W][ARG_C*3+ARG_F+1][i]==INDI){
- consv[i]=0-convinfo.csi[ARG_C*3+ARG_F+1]+convinfo.avc[ARG_C*3+ARG_F+1];}
- else consv[i]=convinfo.conv[ARG_W][ARG_C*3+ARG_F+1][i];
- }
- }
- /* find the largest and the smallest conservation indices */
- max_index=mini_index=consv[1];
- for(i=2;i<=alilen;i++) {
- if(consv[i]>max_index) max_index=consv[i];
- if(consv[i]<mini_index) mini_index=consv[i];
- }
- for(i=1;i<=alilen;i++) {
- csv_index[i]=(int)(9.99*(consv[i]-mini_index)/(max_index-mini_index));
- }
- /* print file with conservation indices mapped to the alignment */
- if((ft=fopen(ARG_T,"w"))==NULL){;}
- else{
- nt=1;
- fclose(ft);
- if(ARG_B){;}
- else{ARG_B=60;}
- /*printali(ARG_T, ARG_B, nal, alilen, aname, aseq, astart, csv_index);*/
- printali(ARG_T, ARG_B, NAL, alilen, aname, aseq, astart, csv_index);
- }
- /* change b-factor to conservation index */
- if((fpdb=fopen(ARG_P,"r"))==NULL){
- if(strlen(ARG_P)!=0){fprintf(stderr,"please give the right pdb file\n");
- }
- }
- else{
- if((fpdbout=fopen(ARG_D,"w"))==NULL){
- if(strlen(ARG_D)!=0){fprintf(stderr,"output pdb file using default name\n");}
- fpdbout=stdout;}
- convstr=cvector(0,200);
- while(fgets(fstr,MAXSTR,fpdb)!=NULL){
- if(strncmp(fstr,"ATOM ",7)==0)break;
- }
- if(feof(fpdb)){fprintf(stderr,"end of the file reached: not a good pdb file");exit(0);}
- ptoc=fstr+23;
- j=atoi(ptoc);i=1;
- for(i=1;ALIGNMENT[1][i]==0;i++);
- ptoc=fstr+17;
- if(strncmp(ptoc,am3[ALIGNMENT[1][i]],3)!=0){
- fprintf(stderr,"The first sequence does not match the pdb file\n");
- exit(0); }
- while(!feof(fpdb)){
- ptoc=fstr+66;
- strcpy(tmpstr, ptoc);
- *(fstr+61)='\0';
- /*if(strncmp(fstr,"HETATM ",7)==0){
- heteroatom=-1.0;
- sprintf(convstr,"%5.2f",heteroatom);
- strcat(fstr,convstr);
- strcat(fstr,tmpstr);
- fprintf(fpdbout,"%s",fstr);}
- else{*/
- if(consv[i]>-10){sprintf(convstr,"%5.2f",consv[i]);}
- else{consv[i]=-9.9;sprintf(convstr,"%5.2f",consv[i]);}
- strcat(fstr,convstr);
- strcat(fstr,tmpstr);
- fprintf(fpdbout,"%s",fstr);
- /*}*/
- fgets(fstr,MAXSTR,fpdb);
- if(strncmp(fstr,"ATOM ",7)!=0){
- if(strncmp(fstr,"HETATM ",7)==0){;}
- else{fprintf(fpdbout,"END\n");break;}
- }
- ptoc=fstr+23;
- if(j!=atoi(ptoc)) {
- /*fprintf(stderr,"i=%d ptoc=%d\n", i, atoi(ptoc));*/
- /* change 04.28.2011 */
- /*i=i+atoi(ptoc)-j;j=atoi(ptoc);*/
- i++;j=atoi(ptoc);
- for(;ALIGNMENT[1][i]==0;i++);
- }
- ptoc=fstr+17;
- if(strncmp(ptoc,am3[ALIGNMENT[1][i]],3)!=0){
- if(strncmp(fstr,"HETATM ",7)==0){;}
- if(i>alilen) {fprintf(fpdbout,"END\n");break;}
- if(strncmp(ptoc,am3[ALIGNMENT[1][i]],3)!=0){
- if(strncmp(fstr,"HETATM ",7)!=0){
- fprintf(stderr,"The first sequence does not match the pdb file\n");
- fprintf(stderr,"%s %s", am3[ALIGNMENT[1][i]], ptoc);
- exit(0);
- }
- }
- }
- }
- fclose(fpdb);fclose(fpdbout);
- }
- /* print conservation indices */
- if( (strcmp(ARG_A,"T")==0)||(strcmp(ARG_A,"t")==0) ) {
- /* print results from all the 9 methods */
- for(j=1;j<=NUM_METHOD;j++) {
- nmlconv(consvall[j], convinfo, ARG_W, j);
- if((strcmp(ARG_N,"F")==0)||(strcmp(ARG_N,"f")==0)){
- for(i=1;i<=convinfo.alignlen;i++){
- if(convinfo.conv[ARG_W][j][i]==INDI)
- consvall[j][i]=0-convinfo.csi[j]+convinfo.avc[j];
- else consvall[j][i]=convinfo.conv[ARG_W][j][i];
- }
- }
- }
- if((fout=fopen(ARG_O,"w"))==NULL) fout = stdout;
- for(i=1;i<=convinfo.alignlen;i++){
- fprintf(fout, "%-5d %c ",i,am[alignment[1][i]]);
- for(j=1;j<=NUM_METHOD;j++) {
- fprintf(fout, "%8.3f", consvall[j][i]);
- }
- if(convinfo.conv[ARG_W][1][i]==INDI){
- fprintf(fout," *\n");
- } else fprintf(fout, "\n");
- }
- fprintf(fout,"* gap fraction no less than %5.2f; conservation set to M-S\n", ARG_G);
- fprintf(fout," M: mean; S: standard deviation\n");
- print_parameters(fout,ARG_I,ARG_O,nt,ARG_T,ARG_B,ARG_S,ARG_M,ARG_F,ARG_C,ARG_W,ARG_N,ARG_A,ARG_E,ARG_G,ARG_P,ARG_D);
- fclose(fout);
- }
- else {
- if((fout=fopen(ARG_O,"w"))==NULL){
- for(i=1;i<=convinfo.alignlen;i++){
- if(convinfo.conv[ARG_W][ARG_C*3+ARG_F+1][i]==INDI){
- fprintf(stdout, "%-5d %c %6.3f *\n", i,am[alignment[1][i]], consv[i]);
- }
- else{
- fprintf(stdout, "%-5d %c %6.3f\n", i,am[alignment[1][i]], consv[i]);
- }
- }
- fprintf(stdout,"* gap fraction no less than %5.2f; conservation set to M-S\n", ARG_G);
- fprintf(stdout," M: mean; S: standard deviation\n");
- print_parameters(stdout,ARG_I,ARG_O,nt,ARG_T,ARG_B,ARG_S,ARG_M,ARG_F,ARG_C,ARG_W,ARG_N,ARG_A,ARG_E,ARG_G,ARG_P,ARG_D);
- }
- else{
- for(i=1;i<=convinfo.alignlen;i++){
- if(convinfo.conv[ARG_W][ARG_C*3+ARG_F+1][i]==INDI){
- fprintf(fout, "%-5d %c %6.3f *\n", i,am[alignment[1][i]], consv[i]);
- }
- else{
- fprintf(fout, "%-5d %c %6.3f\n", i,am[alignment[1][i]], consv[i]);
- }
- }
- fprintf(fout,"* gap fraction no less than %5.2f; conservation set to M-S\n", ARG_G);
- fprintf(fout," M: mean; S: standard deviation\n");
- print_parameters(fout,ARG_I,ARG_O,nt,ARG_T,ARG_B,ARG_S,ARG_M,ARG_F,ARG_C,ARG_W,ARG_N,ARG_A,ARG_E,ARG_G,ARG_P,ARG_D);
- fclose(fout);
- }
- }
- /*print_parameters(stderr,ARG_I,ARG_O,nt,ARG_T,ARG_B,ARG_S,ARG_M,ARG_F,ARG_C,ARG_W,ARG_N,ARG_G,ARG_P,ARG_D);*/
- exit(0);
- }
- double *nmlconv(double *conv, conv_info cvf, int wn,int mi)
- {
- double mean=0,vc=0;
- int i,cps=0;
- for(i=1;i<=cvf.alignlen;i++){
- if(cvf.conv[wn][mi][i]==INDI) continue;
- cps++;
- mean+=cvf.conv[wn][mi][i];
- }
- if(cps<=1) {fprintf(stderr,"less than one position\n");exit(0);}
- mean=mean/cps;
- /*fprintf(stderr, "mean value is: %f\n",mean);*/
- cvf.avc[mi]=mean;
- for(i=1;i<=cvf.alignlen;i++){
- if(cvf.conv[wn][mi][i]==INDI) continue;
- vc+=(cvf.conv[wn][mi][i]-mean)*(cvf.conv[wn][mi][i]-mean);
- }
- vc=sqrt(vc/(cps-1));
- /*fprintf(stderr, "sigma is: %f\n",vc);*/
- cvf.csi[mi]=vc;
- if(vc==0){fprintf(stderr,"variance is zero\n");exit(0);}
- for(i=1;i<=cvf.alignlen;i++){
- if(cvf.conv[wn][mi][i]==INDI) {conv[i]=-1.0;continue;}
- if(vc!=0)conv[i]=(cvf.conv[wn][mi][i]-mean)/vc;
- else conv[i]=0;
- }
- }
- int **read_aa_imatrix(FILE *fmat){
- /* read matrix from file *fmat */
- int i,ri,rj,c,flag,j,t;
- int col[31],row[31];
- char stri[11];
- int **mat;
- mat=imatrix(0,25,0,25);
- for(i=0;i<=25;++i)for(j=0;j<=25;++j)mat[i][j]=0;
- i=0;
- ri=0;
- rj=0;
- flag=0;
- while( (c=getc(fmat)) != EOF){
- if(flag==0 && c=='#'){flag=-1;continue;}
- else if(flag==-1 && c=='\n'){flag=0;continue;}
- else if(flag==-1){continue;}
- else if(flag==0 && c==' '){flag=1;continue;}
- else if(flag==1 && c=='\n'){flag=0;continue;}
- else if(flag==1 && c==' '){continue;}
- else if(flag==1){
- ++i;
- if(i>=25){nrerror("matrix has more than 25 columns: FATAL");exit(0);}
- col[i]=am2numBZX(c);
- continue;
- }
- else if(flag==0 && c!=' ' && c!='#'){
- ri=0;
- ++rj;
- if(rj>=25){nrerror("matrix has more than 25 rows: FATAL");exit(0);}
- row[rj]=am2numBZX(c);
- flag=2;
- continue;
- }
- else if (flag==2 && c==' '){for(i=0;i<=10;++i){stri[i]=' ';}j=0;continue;}
- else if (flag==2 && c=='\n'){flag=0;continue;}
- else if (flag==2){flag=3;stri[++j]=c;if(j>10){nrerror("string too long:FATAL");exit(0);}continue;}
- else if (flag==3 && c==' ' || flag==3 && c=='\n'){
- j=0;
- ++ri;
- t=atoi(stri);
- mat[row[rj]][col[ri]]=t;
- if (c=='\n')flag=0;else flag=2;
- continue;
- }
- else if (flag==3){stri[++j]=c;continue;}
- }
- return mat;
- }
- int **identity_imat(long n){
- /* allocates square integer identity matrix of length n+1: m[0.n][0.n] */
- int i,j;
- int **m;
- m=imatrix(0,n,0,n);
- for(i=0;i<=n;++i)for(j=0;j<=n;++j){if(i==j)m[i][j]=1;else m[i][j]=0;}
- return m;
- }
- void argument()
- {
- fprintf(stderr," al2co arguments:\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -i Input alignment file [File in]\n");
- fprintf(stderr," Format: ClustalW or simple alignment format\n");
- fprintf(stderr," The title (first line) should begin with \"CLUSTAL\", or\n");
- fprintf(stderr," the title line should be deleted.\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -o Output file with conservation index for each position in the\n");
- fprintf(stderr," alignment [File out] Optional\n");
- fprintf(stderr," Default = STDOUT\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -t Output file with conservation index mapped to the alignment\n");
- fprintf(stderr," [File out] Optional\n");
- fprintf(stderr," Conservation indices are linearly rescaled to be from 0\n");
- fprintf(stderr," to 9.99. C'=9.99*(C-MIN)/(MAX-MIN), where C and C' are the\n");
- fprintf(stderr," the indices before and after rescaling respectively, MAX and\n");
- fprintf(stderr," MIN are the highest index and lowest index before rescaling\n");
- fprintf(stderr," respectively. The integer part of each rescaled index is\n");
- fprintf(stderr," written out along with the sequence alignment.\n");
- fprintf(stderr," Default = no output\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -b Block size of the output alignment file with conservation\n");
- fprintf(stderr," [Integer] Optional\n");
- fprintf(stderr," Default = 60\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -s Input file with the scoring matrix [File in] Optional\n");
- fprintf(stderr," Format: NCBI\n");
- fprintf(stderr," Notice: Scoring matrix is only used for sum-of-pairs measure\n");
- fprintf(stderr," with option -c 2.\n");
- fprintf(stderr," Default = identity matrix\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -m Scoring matrix transformation [Integer] Optional\n");
- fprintf(stderr," Options:\n");
- fprintf(stderr," 0=no transformation,\n");
- fprintf(stderr," 1=normalization S'(a,b)=S(a,b)/sqrt[S(a,a)*S(b,b)],\n");
- fprintf(stderr," 2=adjustment S\"(a,b)=2*S(a,b)-(S(a,a)+S(b,b))/2\n");
- fprintf(stderr," Default = 0\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -f Weighting scheme for amino acid frequency estimation [Integer] Optional\n");
- fprintf(stderr," Options:\n");
- fprintf(stderr," 0=unweighted,\n");
- fprintf(stderr," 1=weighted by the modified method of Henikoff & Henikoff (2,3),\n");
- fprintf(stderr," 2=independent-count based (1)\n");
- fprintf(stderr," Default = 2\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -c Conservation calculation method [Integer] Optional\n");
- fprintf(stderr," Options:\n");
- fprintf(stderr," 0=entropy-based C(i)=sum_{a=1}^{20}f_a(i)*ln[f_a(i)], where f_a(i)\n");
- fprintf(stderr," is the frequency of amino acid a at position i,\n");
- fprintf(stderr," 1=variance-based C(i)=sqrt[sum_{a=1}^{20}(f_a(i)-f_a)^2], where f_a\n");
- fprintf(stderr," is the overall frequency of amino acid a,\n");
- fprintf(stderr," 2=sum-of-pairs measure C(i)=sum_{a=1}^{20}sum_{b=1}^{20}f_a(i)*f_b(i)*S_{ab},\n");
- fprintf(stderr," where S_{ab} is the element of a scoring matrix for amino acids a and b\n");
- fprintf(stderr," Default = 0\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -w Window size used for averaging [Integer] Optional\n");
- fprintf(stderr," Default = 1\n");
- fprintf(stderr," Recommended value for motif analysis: 3\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -n Normalization option [T/F] Optional\n");
- fprintf(stderr," Subtract the mean from each conservation index and divide by the\n");
- fprintf(stderr," standard deviation.\n");
- fprintf(stderr," Default = T\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -a All methods option [T/F] Optional\n");
- fprintf(stderr," If set to true, the results of all 9 methods will be output.\n");
- fprintf(stderr," 1. unweighted entropy measure; 2. Henikoff entropy measure;\n");
- fprintf(stderr," 3. independent count entropy measure;\n");
- fprintf(stderr," 4. unweighted variance measure; 5. Henikoff variance measure;\n");
- fprintf(stderr," 6. independent count variance measure;\n");
- fprintf(stderr," 7. unweighted matrix-based sum-of-pairs measure;\n");
- fprintf(stderr," 8. Henikoff matrix-based sum-of-pairs measure;\n");
- fprintf(stderr," 9. independent count matrix-based sum-of-pairs measure;\n");
- fprintf(stderr," Default = F\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -e Excluding the first sequence from calculation [T/F] Optional\n");
- fprintf(stderr," If set to true, the first sequence in the alignment will not\n");
- fprintf(stderr," be included in the conservation calculation.\n");
- fprintf(stderr," Default = F\n\n");
- fprintf(stderr," -g Gap fraction to suppress conservation calculation [Real] Optional\n");
- fprintf(stderr," The value should be more than 0 and no more than 1. Conservation\n");
- fprintf(stderr," indices are calculated only for positions with gap fraction less\n");
- fprintf(stderr," than the specified value. Otherwise, conservation indices will\n");
- fprintf(stderr," be set to M-S, where M is the mean conservation value and S is\n");
- fprintf(stderr," the standard deviation.\n");
- fprintf(stderr," Default = 0.5\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -p Input pdb file [File in] Optional\n");
- fprintf(stderr," The sequence in the pdb file should match exactly the first sequence of\n");
- fprintf(stderr," the alignment.\n");
- fprintf(stderr,"\n");
- fprintf(stderr," -d Output pdb file [File Out] Optional\n");
- fprintf(stderr," The B-factors are replaced by the conservation indices.\n");
- fprintf(stderr," Default = STDOUT\n");
- fprintf(stderr,"\n");
- }
- void print_parameters(FILE *outfile,char *argi,char *argo,int nt,char *argt,int argb,char *args,int argm,int argf,int argc, int argw, char *argn,char *arga, char *arge, double argg, char *argp,char *argd)
- {
- FILE *fp;
- char *mt[]={
- "no transformation",
- "normalization S'(a,b)=S(a,b)/sqrt[S(a,a)*S(b,b)]",
- "adjustment S\"(a,b)=2*S(a,b)-(S(a,a)+S(b,b))/2",
- };
- char *ws[]={
- "unweighted",
- "weighted by the modified method of Henikoff & Henikoff",
- "independent-count based",
- };
- char *cm[]={
- "entropy-based",
- "variance-based",
- "sum-of-pairs measure",
- };
- fprintf(outfile, "\nal2co - The parameters are:\n\n");
- fprintf(outfile, "Input alignment file - %s\n", argi);
- if((fp=fopen(argo,"r"))==NULL){
- fprintf(outfile, "Output conservation - STDOUT\n");
- }
- else{ fprintf(outfile, "Output conservation file - %s\n", argo); fclose(fp);}
- if(nt==0){;}
- else if(nt==1) {
- fprintf(outfile, "Output alignment file with index - %s;", argt);
- fprintf(outfile, " Block size - %d\n",argb);
- }
- if((fp=fopen(args,"r"))==NULL){;}
- else {
- if(argc==2) {
- fprintf(outfile,"Input matrix file - %s\n",args);
- fprintf(outfile,"Matrix transformation - %s\n",mt[argm]);
- }
- else {
- fprintf(outfile,"Input matrix file - %s\n",args);
- if( (strcmp(arga,"T")==0) || (strcmp(arga, "t")==0) ) {;}
- else fprintf(outfile,"Warning - Matrix not used: matrix is for sum-of-pairs measure, -c 2\n"); }
- fclose(fp);
- }
- if( (strcmp(arga,"T")==0) || (strcmp(arga, "t")==0) ) {
- fprintf(outfile, "All 9 methods are used - true \n");
- fprintf(outfile, " 1. unweighted entropy measure; 2. Henikoff entropy measure;\n");
- fprintf(outfile, " 3. independent count entropy measure;\n");
- fprintf(outfile, " 4. unweighted variance measure; 5. Henikoff variance measure;\n");
- fprintf(outfile, " 6. independent count variance measure;\n");
- fprintf(outfile, " 7. unweighted matrix-based sum-of-pairs measure;\n");
- fprintf(outfile, " 8. Henikoff matrix-based sum-of-pairs measure;\n");
- fprintf(outfile, " 9. independent count matrix-based sum-of-pairs measure;\n");
- }
- else {
- fprintf(outfile, "Weighting scheme - %s\n", ws[argf]);
- fprintf(outfile, "Conservation calculation method - %s\n",cm[argc]);
- }
- if( (strcmp(arge, "T")==0) || (strcmp(arge, "t")==0 ) ) {
- fprintf(outfile, "The first sequence is not used in calculation\n");
- }
- fprintf(outfile, "Window size - %d\n", argw);
- if((strcmp(argn,"f")==0)||(strcmp(argn,"F")==0)){
- fprintf(outfile, "Conservation not normalized\n");}
- else fprintf(outfile, "Conservation normalized to zero mean and unity variance\n");
- fprintf(outfile, "Gap fraction to suppress calculation - %5.2f\n",argg);
- if((fp=fopen(argp,"r"))==NULL){;}
- else{
- fprintf(outfile,"Input pdb file - %s\n", argp);
- fclose(fp);
- if((fp=fopen(argd,"r"))==NULL){
- fprintf(outfile, "Output pdb file - STDOUT\n");
- }
- else { fprintf(outfile, "Output pdb file - %s\n", argd);
- fclose(fp);
- }
- }
- }
- #define NR_END 1
- #define FREE_ARG char*
- void nrerror(char error_text[]){
- fprintf(stderr,"%s\n",error_text);
- fprintf(stderr,"FATAL - execution terminated\n");
- exit(1);
- }
- char *cvector(long nl, long nh){
- char *v;
- v=(char *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(int)));
- if (!v) nrerror("allocation failure in ivector()");
- return v-nl+NR_END;
- }
- int *ivector(long nl, long nh){
- int *v;
- v=(int *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(int)));
- if (!v) nrerror("allocation failure in ivector()");
- return v-nl+NR_END;
- }
- long *lvector(long nl, long nh){
- long int *v;
- v=(long int *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(long int)));
- if (!v) nrerror("allocation failure in lvector()");
- return v-nl+NR_END;
- }
- double *dvector(long nl, long nh){
- double *v;
- v=(double *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(double)));
- if (!v) nrerror("allocation failure in dvector()");
- return v-nl+NR_END;
- }
- int **imatrix(long nrl, long nrh, long ncl, long nch){
- long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;
- int **m;
- m=(int **)malloc((size_t)((nrow+NR_END)*sizeof(int*)));
- if (!m) nrerror("allocation failure 1 in imatrix()");
- m += NR_END;
- m -= nrl;
- m[nrl]=(int *)malloc((size_t)((nrow*ncol+NR_END)*sizeof(int)));
- if (!m[nrl]) nrerror("allocation failure 2 in imatrix()");
- m[nrl] += NR_END;
- m[nrl] -= ncl;
- for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;
- return m;
- }
- double **dmatrix(long nrl, long nrh, long ncl, long nch){
- long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;
- double **m;
- m=(double **)malloc((size_t)((nrow+NR_END)*sizeof(double*)));
- if (!m) nrerror("allocation failure 1 in dmatrix()");
- m += NR_END;
- m -= nrl;
- m[nrl]=(double *)malloc((size_t)((nrow*ncol+NR_END)*sizeof(double)));
- if (!m[nrl]) nrerror("allocation failure 2 in dmatrix()");
- m[nrl] += NR_END;
- m[nrl] -= ncl;
- for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;
- return m;
- }
- double ***d3tensor(long nrl,long nrh,long ncl,long nch,long ndl,long ndh){
- long i,j,nrow=nrh-nrl+1,ncol=nch-ncl+1,ndep=ndh-ndl+1;
- double ***t;
- t=(double ***) malloc((size_t)((nrow+NR_END)*sizeof(double**)));
- if(!t)nrerror("allocation failure 1 in d3tensor()");
- t += NR_END;
- t -= nrl;
- t[nrl]=(double **) malloc((size_t)((nrow*ncol+NR_END)*sizeof(double*)));
- if(!t[nrl])nrerror("allocation failure 2 in d3tensor()");
- t[nrl] += NR_END;
- t[nrl] -= ncl;
- t[nrl][ncl]=(double *) malloc((size_t)((nrow*ncol*ndep+NR_END)*sizeof(double)));
- if(!t[nrl][ncl])nrerror("allocation failure 3 in d3tensor()");
- t[nrl][ncl] += NR_END;
- t[nrl][ncl] -= ndl;
- for(j=ncl+1;j<=nch;j++) t[nrl][j]=t[nrl][j-1]+ndep;
- for(i=nrl+1;i<=nrh;i++){
- t[i]=t[i-1]+ncol;
- t[i][ncl]=t[i-1][ncl]+ncol*ndep;
- for(j=ncl+1;j<=nch;j++)t[i][j]=t[i][j-1]+ndep;
- }
- return t;
- }
- int am2num_c(c)
- {
- switch (c) {
- case 'W':
- c=1; break;
- case 'w':
- c=26; break;
- case 'F':
- c=2; break;
- case 'f':
- c=27; break;
- case 'Y':
- c=3; break;
- case 'y':
- c=28; break;
- case 'M':
- c=4; break;
- case 'm':
- c=29; break;
- case 'L':
- c=5; break;
- case 'l':
- c=30; break;
- case 'I':
- c=6; break;
- case 'i':
- c=31; break;
- case 'V':
- c=7; break;
- case 'v':
- c=32; break;
- case 'A':
- c=8; break;
- case 'a':
- c=33; break;
- case 'C':
- c=9; break;
- case 'c':
- c=34; break;
- case 'G':
- c=10; break;
- case 'g':
- c=35; break;
- case 'P':
- c=11; break;
- case 'p':
- c=36; break;
- case 'T':
- c=12; break;
- case 't':
- c=37; break;
- case 'S':
- c=13; break;
- case 's':
- c=38; break;
- case 'N':
- c=14; break;
- case 'n':
- c=39; break;
- case 'Q':
- c=15; break;
- case 'q':
- c=40; break;
- case 'D':
- c=16; break;
- case 'd':
- c=41; break;
- case 'E':
- c=17; break;
- case 'e':
- c=42; break;
- case 'H':
- c=18; break;
- case 'h':
- c=43; break;
- case 'R':
- c=19; break;
- case 'r':
- c=44; break;
- case 'K':
- c=20; break;
- case 'k':
- c=45; break;
- default :
- c=0;
- }
- return (c);
- }
- int am2num(c)
- {
- switch (c) {
- case 'W': case 'w':
- c=1; break;
- case 'F': case 'f':
- c=2; break;
- case 'Y': case 'y':
- c=3; break;
- case 'M': case 'm':
- c=4; break;
- case 'L': case 'l':
- c=5; break;
- case 'I': case 'i':
- c=6; break;
- case 'V': case 'v':
- c=7; break;
- case 'A': case 'a':
- c=8; break;
- case 'C': case 'c':
- c=9; break;
- case 'G': case 'g':
- c=10; break;
- case 'P': case 'p':
- c=11; break;
- case 'T': case 't':
- c=12; break;
- case 'S': case 's':
- c=13; break;
- case 'N': case 'n':
- c=14; break;
- case 'Q': case 'q':
- c=15; break;
- case 'D': case 'd':
- c=16; break;
- case 'E': case 'e':
- c=17; break;
- case 'H': case 'h':
- c=18; break;
- case 'R': case 'r':
- c=19; break;
- case 'K': case 'k':
- c=20; break;
- default :
- c=0;
- }
- return (c);
- }
- int am2numBZX(c)
- {
- switch (c) {
- case 'W': case 'w':
- c=1; break;
- case 'F': case 'f':
- c=2; break;
- case 'Y': case 'y':
- c=3; break;
- case 'M': case 'm':
- c=4; break;
- case 'L': case 'l':
- c=5; break;
- case 'I': case 'i':
- c=6; break;
- case 'V': case 'v':
- c=7; break;
- case 'A': case 'a':
- c=8; break;
- case 'C': case 'c':
- c=9; break;
- case 'G': case 'g':
- c=10; break;
- case 'P': case 'p':
- c=11; break;
- case 'T': case 't':
- c=12; break;
- case 'S': case 's':
- c=13; break;
- case 'N': case 'n':
- c=14; break;
- case 'Q': case 'q':
- c=15; break;
- case 'D': case 'd':
- c=16; break;
- case 'E': case 'e':
- c=17; break;
- case 'H': case 'h':
- c=18; break;
- case 'R': case 'r':
- c=19; break;
- case 'K': case 'k':
- c=20; break;
- case 'B': case 'b':
- c=21; break;
- case 'Z': case 'z':
- c=22; break;
- case 'X': case 'x':
- c=23; break;
- case '*':
- c=24; break;
- default :
- c=0;
- }
- return (c);
- }
- static char str[MAXSTR+1];
- /*char **aname, **aseq;
- int nal, alilen, *astart, *alen;
- int **alignment;
- */
- static void *mymalloc(int size);
- char *strsave(char *str);
- char *strnsave(char *str, int l);
- static char **incbuf(int n, char **was);
- static int *incibuf(int n, int *was);
- void readali(char *filename);
- int **ali_char2int(char **aseq,int start_num, int start_seq);
- int **read_alignment2int(char *filename,int start_num,int start_seq);
- void counter(int b);
- double effective_number(int **ali, int *marks, int n, int start, int end);
- double effective_number_nogaps(int **ali, int *marks, int n, int start, int end);
- double effective_number_nogaps_expos(int **ali, int *marks, int n, int start, int end, int pos);
- static void *mymalloc(size)
- int size;
- {
- void *buf;
- if ((buf = malloc(size)) == NULL) {
- fprintf(stderr, "Not enough memory: %d\n", size);
- exit(1);
- }
- return buf;
- }
- char *strsave(str)
- char *str;
- {
- char *buf;
- int l;
- l = strlen(str);
- buf = mymalloc(l + 1);
- strcpy(buf, str);
- return buf;
- }
- char *strnsave(str, l)
- char *str;
- int l;
- {
- char *buf;
- buf = mymalloc(l + 1);
- memcpy(buf, str, l);
- buf[l] = '\0';
- return buf;
- }
- static char **incbuf(n, was)
- int n;
- char **was;
- {
- char **buf;
- char *aaa;
- buf = mymalloc((n+1) * sizeof(buf[0]));
- if (n > 0) {
- memcpy(buf, was, n * sizeof(was[0]));
- free(was);
- }
- buf[n] = NULL;
- return buf;
- }
- static int *incibuf(n, was)
- int n, *was;
- {
- int *ibuf;
- ibuf = mymalloc((n+1) * sizeof(ibuf[0]));
- if (n > 0) {
- memcpy(ibuf, was, n * sizeof(was[0]));
- free(was);
- }
- ibuf[n] = 0;
- return ibuf;
- }
- void err_readali(int err_num)
- {
- fprintf(stderr,"Error with reading alignment: %d\n",err_num);
- }
- void readali(filename)
- char *filename;
- {
- FILE *fp;
- char *s, *ss, *seqbuf;
- int n, l, len, len0;
- int ii,mark=1;
- if ((fp = fopen(filename, "r")) == NULL) {
- fprintf(stderr, "No such file: \"%s\"\n", filename);
- err_readali(1);
- ;exit(1);
- }
- alilen = 0;
- nal = 0;
- n = 0;
- if(fgets(str, MAXSTR, fp) != NULL) {
- if(strncmp(str,"CLUSTAL",7)!=0){rewind(fp);}
- }
- while (fgets(str, MAXSTR, fp) != NULL) {
- for (ss = str; isspace(*ss); ss++) ;
- if ((ii<=ss-str)&&(mark==0)) {continue;}
- if (*ss == '\0') {
- if (n == 0) {
- continue;
- }
- if (nal == 0) {
- if (n == 0) {
- fprintf(stderr, "No alignments read\n");
- err_readali(2);
- exit(1);
- }
- nal = n;
- } else if (n != nal) {
- fprintf(stderr, "Wrong nal, was: %d, now: %d\n", nal, n);
- err_readali(3); exit(1);
- }
- n = 0;
- continue;
- }
- for (s = ss; *s != '\0' && !isspace(*s); s++) ;
- *s++ = '\0';
- if (nal == 0) {
- astart = incibuf(n, astart);
- alen = incibuf(n, alen);
- aseq = incbuf(n, aseq);
- aname = incbuf(n, aname);
- aname[n] = strsave(ss);
- } else {
- if (n < 0 || n >= nal) {
- fprintf(stderr, "Bad sequence number: %d of %d\n", n, nal);
- err_readali(4); exit(1);
- }
- if (strcmp(ss, aname[n]) != 0) {
- fprintf(stderr, "Names do not match");
- fprintf(stderr, ", was: %s, now: %s\n", aname[n], ss);
- err_readali(5); exit(1);
- }
- }
- for (ss = s; isspace(*ss); ss++);
- if(mark==1){
- ii = ss-str;
- mark=0;}
- for (s = ss; isdigit(*s); s++) ;
- if (isspace(*s)) {
- if (nal == 0) {
- astart[n] = atoi(ss);
- }
- for (ss = s; isspace(*ss); ss++);
- }
- for (s = ss, l = 0; *s != '\0' && !isspace(*s); s++) {
- if (isalpha(*s)) {
- l++;
- }
- }
- len = s - ss;
- if (n == 0) {
- len0 = len;
- alilen += len;
- } else if (len != len0) {
- fprintf(stderr, "wrong len for %s", aname[n]);
- fprintf(stderr, ", was: %d, now: %d\n", len0, len);
- err_readali(6); exit(1);
- }
- alen[n] += l;
- if (aseq[n] == NULL) {
- aseq[n] = strnsave(ss, len);
- } else {
- seqbuf = mymalloc(alilen+1);
- memcpy(seqbuf, aseq[n], alilen-len);
- free(aseq[n]);
- aseq[n] = seqbuf;
- memcpy(seqbuf+alilen-len, ss, len);
- seqbuf[alilen] = '\0';
- }
- n++;
- }
- if (nal == 0) {
- if (n == 0) {
- fprintf(stderr, "No alignments read\n");
- err_readali(7); exit(1);
- }
- nal = n;
- } else if (n != 0 && n != nal) {
- fprintf(stderr, "Wrong nal, was: %d, now: %d\n", nal, n);
- err_readali(8); exit(1);
- }
- fclose(fp);
- }
- static void printali(char *argt, int chunk, int n, int len, char **name, char **seq, int *start,int *csv)
- {
- int i, j, k, jj, mlen, sta, *pos;
- char csv_str[100], arg_t[500];
- char *sq;
- int *isq;
- FILE *fpp;
- strcpy(arg_t,argt);
- fpp=fopen(arg_t,"w");
- strcpy(csv_str, "Conservation:");
- pos = mymalloc(n * sizeof(pos[0]));
- memcpy(pos, start, n * sizeof(pos[0]));
- for (i=0; i < n && start[i] == 0; i++) ;
- sta = (i < n);
- for (i=1, mlen=strlen(name[0]); i < n; i++) {
- if (mlen < strlen(name[i])) {
- mlen = strlen(name[i]);
- }
- }
- jj = 0;
- do {
- if (jj > 0) {
- fprintf(fpp, "\n");
- }
- for (i=0; i < n; i++) {
- if(i==0) {
- fprintf(fpp,"\n\n");
- for(k=0;k<mlen+4;k++){
- if(k<strlen(csv_str)){
- fprintf(fpp,"%c",csv_str[k]);}
- else fprintf(fpp," ");
- }
- if (sta) {
- fprintf(fpp,"%4d ", pos[i]);
- }
- isq = csv + jj;
- for (j=0; j+jj < len && j < chunk; j++) {
- if (isdigit(isq[j+1])) {
- pos[i]++;
- }
- fprintf(fpp,"%d", isq[j+1]);
- }
- if (sta) {
- fprintf(fpp, " %4d", pos[i]-1);
- }
- fprintf(fpp, "\n");
- }
- for(k=0;k<mlen+4;k++){
- if(k<strlen(name[i])){
- fprintf(fpp,"%c",name[i][k]);}
- else fprintf(fpp," ");
- }
- if (sta) {
- fprintf(fpp, "%4d ", pos[i]);
- }
- sq = seq[i] + jj;
- for (j=0; j+jj < len && j < chunk; j++) {
- if (isalpha(sq[j])) {
- pos[i]++;
- }
- fprintf(fpp, "%c", sq[j]);
- }
- if (sta) {
- fprintf(fpp, " %4d", pos[i]-1);
- }
- fprintf(fpp, "\n");
- }
- jj += j;
- } while (jj < len);
- free(pos);
- fclose(fpp);
- }
- int **ali_char2int(char **aseq, int start_num, int start_seq){
- /* fills the alignment ali[start_num..start_num+nal-1][start_seq..start_seq+alilen-1]
- convetring charater to integer from aseq[0..nal-1][0..alilen-1]
- */
- int i,j,end_num,end_seq;
- int **ali;
- end_num=start_num+nal-1;
- end_seq=start_seq+alilen-1;
- ali=imatrix(start_num,end_num,start_seq,end_seq);
- for(i=start_num;i<=end_num;++i)for(j=start_seq;j<=end_seq;++j)ali[i][j]=am2num(aseq[i-start_num][j-start_seq]);
- return ali;
- }
- int **read_alignment2int(char *filename,int start_num,int start_seq){
- int **ali;
- readali(filename);
- ali=ali_char2int(aseq,start_num,start_seq);
- return ali;
- }
- double effective_number(int **ali, int *marks, int n, int start, int end){
- /* from the alignment of n sequences ali[1..n][1..l]
- calculates effective number of sequences that are marked by 1 in mark[1..n]
- for the segment of positions ali[][start..end]
- Neff=ln(1-0.05*N-of-different-letters-per-site)/ln(0.95)
- */
- int i,k,a,flag,ngc;
- int *amco,lettercount=0,sitecount=0,gsitecount=0;
- int *nogaps;
- double letpersite=0,neff,randomcount=0.0;
- amco=ivector(0,20);
- nogaps=ivector(0,n);
- for(i=0;i<=n;++i)nogaps[i]=0;
- for(k=start;k<=end;++k){
- for(a=0;a<=20;++a)amco[a]=0;
- for(i=1;i<=n;++i)if(marks[i]==1)amco[ali[i][k]]++;
- flag=0;for(a=1;a<=20;++a)if(amco[a]>0){flag=1;lettercount++;}
- if(flag==1)sitecount++;
- }
- letpersite=1.0*lettercount/sitecount;
- for(k=start;k<=end;++k){
- ngc=0;for(i=1;i<=n;++i)if(marks[i]==1)if(ali[i][k]!=0)ngc++;
- if(ngc!=0)gsitecount++;
- nogaps[ngc]++;
- }
- for(i=0;i<=n;++i)fprintf(stderr,"%3d %4d\n",i,nogaps[i]);
- if(gsitecount!=sitecount){fprintf(stderr,"Counts didn't match in \"effective_number\": FATAL\n");exit(0);}
- for(i=1;i<=n;++i)randomcount+=nogaps[i]*20.0*(1.0-exp(-0.05129329438755*i));
- randomcount=randomcount/gsitecount;
- letpersite=letpersite*20.0*(1.0-exp(-0.05129329438755*n))/randomcount;
- neff=-log(1.0-0.05*letpersite)/0.05129329438755;
- return neff;
- }
- double effective_number_nogaps(int **ali, int *marks, int n, int start, int end){
- /* from the alignment of n sequences ali[1..n][1..l]
- calculates effective number of sequences that are marked by 1 in mark[1..n]
- for the segment of positions ali[][start..end]
- Neff=ln(1-0.05*N-of-different-letters-per-site)/ln(0.95)
- */
- int i,k,a,flag;
- int *amco,lettercount=0,sitecount=0;
- double letpersite=0,neff;
- amco=ivector(0,20);
- for(k=start;k<=end;++k){
- flag=0;for(i=1;i<=n;++i)if(marks[i]==1 && ali[i][k]==0)flag=1;
- if(flag==1)continue;
- for(a=0;a<=20;++a)amco[a]=0;
- for(i=1;i<=n;++i)if(marks[i]==1)amco[ali[i][k]]++;
- flag=0;for(a=1;a<=20;++a)if(amco[a]>0){flag=1;lettercount++;}
- if(flag==1)sitecount++;
- }
- if(sitecount==0)letpersite=0;
- else letpersite=1.0*lettercount/sitecount;
- neff=-log(1.0-0.05*letpersite)/0.05129329438755;
- return neff;
- }
- int *letters;
- void **freq(int **ali,double **f,int *num_gaps,int *effindiarr,double gap_threshold)
- {
- int i,j,k,effnumind;
- int count[21];
- letters = ivector(0, alilen+1);
- letters[0]=1;
- /* find the number of frequences at each position */
- effnumind=0;
- for(j=1;j<=alilen;j++){
- for(i=0;i<=20;i++) count[i]=0;
- for(i=1;i<=nal;i++) {
- if(ali[i][j]<=20) {count[ali[i][j]]++;}
- else {if(ali[i][j]>25&&ali[i][j]<=45)
- {count[ali[i][j]-25]++;}
- else {
- fprintf(stderr,"not good number for AA\n");
- exit(0);
- }
- }
- }
- num_gaps[j] = count[0];
- f[0][j] = count[0]*1.0/nal;
- if(f[0][j]>1) {
- fprintf(stderr,"gap number>total number\n");
- exit(0);
- }
- if(f[0][j]>=gap_threshold) { /* ignore the case where gaps occur >= gap_threshold(percentage of gaps) */
- f[0][j]=INDI;
- continue;
- }
- effnumind++;
- effindiarr[effnumind]=j;
- count[0]=nal-count[0];
- letters[j] = count[0];
- if(count[0]<=0){
- fprintf(stderr, "count[0] less than 0: %d\n",count[0]);
- exit(0);
- }
- for(k=1;k<=20;k++){
- f[k][j]=count[k]*1.0/count[0];
- }
- }
- effindiarr[effnumind+1]=INDI;/*set the last element negative*/
- effindiarr[0]=effnumind;
- }
- /* calculating the standard error of frequencies */
- double **sigmaf(double **f)
- {
- double **sigma;
- int i,j;
- sigma = dmatrix(0,20,0,alilen);
- if(letters[0]!=1) {
- fprintf(stderr, "letters not defined previously\n");
- exit(0);
- }
- for(j=1;j<=alilen;j++) {
- if(letters[j]==0) {
- fprintf(stderr,"The column contains no letters:%d\n",j);
- for(i=1;i<=20;i++){sigma[i][j]=0;}
- continue;
- }
- for(i=1;i<=20;i++) {
- if(f[i][j]==0) {sigma[i][j]=0;continue;}
- sigma[i][j]=sqrt(f[i][j]*(1-f[i][j])/letters[j]);
- }
- }
- return sigma;
- }
- int totalw;
- double *oaf_ip, *h_oaf_ip; /*unweighted,henikoff frequency at position ip */
- double *overall_freq(int **ali, int startp, int endp, int *mark);
- double *overall_freq_wgt(int **ali,int startp,int endp,int *mark,double *wgt);
- double *h_weight(int **ali, int ip)
- {
- double *hwt;
- int count[21];
- int amtypes;
- int i,j,k;
- int maxstart, maxs, miniend,minie;
- int *mark;
- int gapcount,totalcount,wsign;
- hwt = dvector(0, nal);
- mark = ivector(0, nal);
- oaf_ip = dvector(0,20);
- h_oaf_ip = dvector(0,20);
- for(i=1;i<=20;i++) h_oaf_ip[i] = 0;
- /* mark the sequences with gaps */
- for(i=1;i<=nal;i++) {
- // NEW: fix a typo 05/06/03
- if((ali[i][ip]>0&&ali[i][ip]<=20)||(ali[i][ip]>25&&ali[i][ip]<=45)) mark[i]=1;
- else if (ali[i][ip]==0)mark[i]=0;
- }
- /* find the maxstart and miniend positions */
- maxstart = 1;
- for(i=1;i<=nal;i++){
- if(mark[i]==0) continue;
- maxs = 1;
- for(j=1;j<=alilen;j++) {
- if(ali[i][j]==0) maxs++;
- if(ali[i][j]>0) break;
- }
- if(maxstart<maxs) maxstart = maxs;
- }
- miniend = alilen;
- for(i=1;i<=nal;i++){
- if(!mark[i]) continue;
- minie = alilen;
- for(j=alilen;j>0;j--) {
- if(ali[i][j]==0) minie--;
- if(ali[i][j]>0) break;
- }
- if(miniend>minie) miniend = minie;
- }
- /* NEW: 05/06/03 */
- if(maxstart > miniend) maxstart = miniend;
- /* special case where only one site is not gap */
- j=0;
- for(i=1;i<=nal;i++) {if(mark[i]>0) j++;}
- if(j==1) {
- for(i=1;i<=nal;i++) hwt[i]=mark[i];
- h_oaf_ip = overall_freq_wgt(ali,maxstart,miniend,mark,hwt);
- oaf_ip = overall_freq(ali,1,alilen,mark);
- return hwt;
- }
- for(i=1;i<=nal;i++) hwt[i]=0;
- for(j=maxstart;j<=miniend;j++){
- amtypes = 0;
- for(i=0;i<=20;i++) count[i]=0;
- for(i=1;i<=nal;i++){
- if(mark[i]==0) continue;
- if(ali[i][j]>=0&&ali[i][j]<=20) count[ali[i][j]]++;
- else if(ali[i][j]>25&&ali[i][j]<=45) count[ali[i][j]-25]++;
- }
- for(i=0;i<=20;i++) {
- if(count[i]>0) amtypes++;
- }
- if(amtypes==1) continue; /* identical positions are excluded */
- gapcount=totalcount=0;
- for(i=1;i<=nal;i++){
- if(mark[i]==0) continue;
- if(ali[i][j]==0) gapcount++;
- totalcount++;
- }
- if(gapcount>totalcount*0.5) continue;/*gap>50% excluded*/
- for(i=1;i<=nal;i++){
- if(mark[i]==0) continue;
- if(ali[i][j]>=0&&ali[i][j]<=20) {
- if(count[ali[i][j]]>0) {
- hwt[i]+=1.0/(count[ali[i][j]]*amtypes);
- }
- }
- if(ali[i][j]>25&&ali[i][j]<=45) {
- if(count[ali[i][j]-25]>0) {
- hwt[i]+=1.0/(count[ali[i][j]-25]*amtypes);
- }
- }
- }
- }
- /* test if all henikoff weights are zero for all sequences */
- wsign=0;
- for(i=1;i<=nal;i++) {
- if(hwt[i]>0) { wsign=1;break;}
- }
- gapcount=0;
- if(wsign==0) {
- for(i=1;i<=nal;i++){
- if(ali[i][ip]==0) gapcount++;
- }
- if(gapcount==nal){fprintf(stderr, "This position contains only gaps\n");exit(0);}
- for(i=1;i<=nal;i++){
- if(ali[i][ip]!=0) hwt[i]=1.0/(nal-gapcount);
- }
- }
- h_oaf_ip = overall_freq_wgt(ali,maxstart,miniend,mark,hwt);
- oaf_ip = overall_freq(ali,1,alilen,mark);
- return hwt;
- }
- double **u_oaf,**h_oaf; /* unweighted and henikoff overall frequency */
- double **h_freq(int **ali, double **f, double **hfr)
- {
- int i,j;
- double *hwt;
- double sumofweight;
- u_oaf = dmatrix(0,20,0,alilen);
- h_oaf = dmatrix(0,20,0,alilen);
- for(j=1;j<=alilen;j++) {
- if(f[0][j]==INDI) {hfr[0][j]=INDI;continue;}
- hwt = h_weight(ali, j); /* assign position specific weight */
- for(i=1;i<=20;i++) {
- u_oaf[i][j]=oaf_ip[i];
- h_oaf[i][j]=h_oaf_ip[i];
- }
- sumofweight = 0;
- for(i=1;i<=nal;i++) {
- if(ali[i][j]>0) sumofweight+=hwt[i];
- }
- for(i=1;i<=nal;i++) {
- if(ali[i][j]>0&&ali[i][j]<=20) {
- hfr[ali[i][j]][j]+=hwt[i]/sumofweight;
- }
- else if(ali[i][j]>25&&ali[i][j]<=45) {
- hfr[ali[i][j]-25][j]+=hwt[i]/sumofweight;
- }
- }
- }
- }
- double *entro_conv(double **f, int **ali, double *econv)
- {
- int i,j;
- for (j=1;j<=alilen;j++){
- econv[j]=0;
- if(f[0][j]==INDI) {econv[j]=INDI;continue;}
- for(i=1;i<=20;i++) {
- if(f[i][j]==0) continue;
- econv[j]+=f[i][j]*log(f[i][j]);
- }
- }
- }
- double *overall_freq(int **ali, int startp, int endp, int *mark)
- {
- double *oaf;
- int gapcount,totalcount;
- int total=0;
- int i,j;
- oaf = dvector(0,20);
- for(i=1;i<=20;i++) oaf[i]=0;
- if(startp>endp) {
- fprintf(stderr, "start position larger than ending position\n");
- exit(1);
- }
- for(j=startp;j<=endp;j++) {
- /* excluding those that have >50% gaps*/
- gapcount=totalcount=0;
- for(i=1;i<=nal;i++){
- if(mark[i]==0) continue;
- totalcount++;
- if(ali[i][j]==0) gapcount++;
- }
- if(gapcount>totalcount*0.5) continue;
- for(i=1;i<=nal;i++) {
- if(mark[i]==0) continue;
- if(ali[i][j]>0&&ali[i][j]<=20) {
- oaf[ali[i][j]]+=1;
- total++;
- }
- if(ali[i][j]>25&&ali[i][j]<=45) {
- oaf[ali[i][j]-25]+=1;
- total++;
- }
- }
- }
- for(i=1;i<=20;i++) {
- oaf[i]=oaf[i]/total;
- }
- return oaf;
- }
- double *overall_freq_wgt(int **ali,int startp,int endp,int *mark,double *wgt)
- {
- double *oaf;
- double total=0;
- int totalcount,gapcount;
- int i,j;
- oaf = dvector(0,20);
- for(i=1;i<=20;i++) oaf[i]=0;
- if(startp>endp) {
- fprintf(stderr, "start position larger than ending position\n"); exit(1);
- }
- for(j=startp;j<=endp;j++) {
- /* excluding those that have >50% gaps*/
- gapcount=totalcount=0;
- for(i=1;i<=nal;i++){
- if(mark[i]==0) continue;
- totalcount++;
- if(ali[i][j]==0) gapcount++;
- }
- if(gapcount>totalcount*0.5) continue;
- for(i=1;i<=nal;i++) {
- if(mark[i]==0) continue;
- if(ali[i][j]>0&&ali[i][j]<=20) {
- oaf[ali[i][j]]+=wgt[i];
- total+=wgt[i];
- }
- if(ali[i][j]>25&&ali[i][j]<=45) {
- oaf[ali[i][j]-25]+=wgt[i];
- total+=wgt[i];
- }
- }
- if(total==0) fprintf(stderr,"total=0\n");
- }
- for(i=1;i<=20;i++) {
- oaf[i]=oaf[i]/total;
- }
- return oaf;
- }
- double **ic_freq(int **ali, double **f, double **icf)
- {
- int i,j,k;
- int ele;
- double *effnu;
- int *mark;
- mark = ivector(0,nal+10);
- effnu = dvector(0,20);
- for(i=0;i<=20;i++)
- for(j=1;j<=alilen;j++)
- icf[i][j]=0;
- for(i=0;i<=nal;i++) mark[i]=0;
- for(j=1;j<=alilen;j++){
- if(f[0][j]==INDI) {icf[0][j]=INDI;continue;}
- for(k=0;k<=20;++k)effnu[k]=0;
- for(k=1;k<=20;++k){
- for(i=1;i<=nal;++i){
- mark[i]=0;
- ele=ali[i][j];
- if(ele==k)mark[i]=1;
- ele=ali[i][j]-25;
- if(ele==k) mark[i]=1;
- }
- effnu[k]=effective_number_nogaps(ali,mark,nal,1,alilen);
- effnu[0]+=effnu[k];
- }
- if(effnu[0]==0){fprintf(stderr,"all counts are zeros at the column %d: FATAL\n",j);exit(0);}
- for(k=1;k<=20;k++) effnu[k]=effnu[k]/effnu[0];
- for(k=1;k<=20;k++) icf[k][j] = effnu[k];
- }
- }
- double *variance_conv(double **f, int **ali, double **oaf, double *vconv)
- {
- int i,j;
- for(i=1;i<=alilen;i++) vconv[i]=0;
- for(j=1;j<=alilen;j++) {
- if(f[0][j]==INDI) {vconv[j]=INDI;continue;}
- for(i=1;i<=20;i++) {
- vconv[j]+=(f[i][j]-oaf[i][j])*(f[i][j]-oaf[i][j]);
- }
- vconv[j]=sqrt(vconv[j]);
- }
- }
- double *pairs_conv(double **f,int **ali,int **matrix1,int indx,double *pconv)
- {
- int i,j,k;
- double **matrix2, **matrix3;
- FILE *fp;
- matrix2 = dmatrix(0,25,0,25);
- matrix3 = dmatrix(0,25,0,25);
- for(i=0;i<=25;i++){
- for(j=0;j<=25;j++) {
- matrix2[i][j]=0;
- matrix3[i][j]=0;
- }
- }
- /* get the matrices */
- for(i=1;i<=24;i++){
- if(matrix1[i][i]==0) {
- fprintf(stderr, "diagonal elements zero: %d\n",i);
- exit(0);
- }
- for(j=1;j<=24;j++) {
- matrix2[i][j]=matrix1[i][j]*1.0/sqrt(matrix1[i][i]*matrix1[j][j]);
- matrix3[i][j]=matrix1[i][j]*2-0.5*(matrix1[i][i]+matrix1[j][j]);
- }
- }
- for(j=1;j<=alilen;j++) {
- if(f[0][j]==INDI){pconv[j]=INDI; continue;}
- pconv[j]=0;
- for(i=1;i<=20;i++) {
- for(k=1;k<=20;k++) {
- if(indx==0){
- pconv[j]+=(f[k][j]*f[i][j])*matrix1[i][k];
- continue;}
- if(indx==1){
- pconv[j]+=(f[k][j]*f[i][j])*matrix2[i][k];
- continue;}
- if(indx==2){
- pconv[j]+=(f[k][j]*f[i][j])*matrix3[i][k];
- continue;}
- fprintf(stderr,"not good index number of matrix\n");
- exit(0);
- }
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement