layer0/Match.cpp (510 lines of code) (raw):

/* A* ------------------------------------------------------------------- B* This file contains source code for the PyMOL computer program C* Copyright (c) Schrodinger, LLC. D* ------------------------------------------------------------------- E* It is unlawful to modify or remove this copyright notice. F* ------------------------------------------------------------------- G* Please see the accompanying LICENSE file for further information. H* ------------------------------------------------------------------- I* Additional authors of this source file include: -* -* -* Z* ------------------------------------------------------------------- */ #include"os_predef.h" #include"os_limits.h" #include"os_std.h" #include"Base.h" #include"MemoryDebug.h" #include"OOMac.h" #include"Match.h" #include"Util.h" #include"Feedback.h" #include"Parse.h" #include"File.h" #ifndef int2 typedef int int2[2]; #endif CMatch *MatchNew(PyMOLGlobals * G, unsigned int na, unsigned int nb, int dist_mats) { unsigned int dim[2]; OOCalloc(G, CMatch); I->na = na; I->nb = nb; dim[0] = na; dim[1] = nb; I->G = G; if(na && nb) { I->mat = (float **) UtilArrayCalloc(dim, 2, sizeof(float)); } if(dist_mats && na) { dim[0] = na + 1; dim[1] = na + 1; I->da = (float **) UtilArrayCalloc(dim, 2, sizeof(float)); } if(dist_mats && nb) { dim[0] = nb + 1; dim[1] = nb + 1; I->db = (float **) UtilArrayCalloc(dim, 2, sizeof(float)); } /* scoring matrix is always 128^2 */ dim[0] = dim[1] = 128; I->smat = (float **) UtilArrayCalloc(dim, 2, sizeof(float)); { /* lay down an 10/-1 identity matrix to cover matches for known residues other than amino acids (dna, rna, as 1,2,3,4 etc.) these values will be overwritten by the matrix */ unsigned int i,j; for(i=0;i<dim[0];i++) { for(j=0;j<dim[1];j++) { I->smat[i][j] = -1.0F; } } for(i=0;i<dim[0];i++) { I->smat[i][i] = 10.0F; /* these values will be overwritten by BLOSUM, etc. */ } // water (was mapped to 'X' in previous versions) I->smat['O']['O'] = -1.0F; } if(!(I->mat && I->smat && ((!dist_mats) || (I->da && I->db)))) { MatchFree(I); I = NULL; } return (I); } int MatchResidueToCode(CMatch * I, int *vla, int n) { int ok = true; int a, b, c; int found; int *trg; char res[][4] = { "HOH", "O", /* water */ /* using numbers to prevent nucleic acids from getting confounded with protein scores */ "A", "1", "DA", "1", "ADE", "1", "C", "2", "DC", "2", "CYT", "2", "G", "3", "DG", "3", "GUA", "3", "T", "4", "DT", "4", "THY", "4", "U", "4", "URA", "4", /* these should correspond to the matrix being read */ "ALA", "A", "CYS", "C", "ASP", "D", "GLU", "E", "PHE", "F", "GLY", "G", "HIS", "H", "ILE", "I", "LYS", "K", "LEU", "L", "MET", "M", "MSE", "M", /* selenomet */ "ASN", "N", "PRO", "P", "GLN", "Q", "ARG", "R", "SER", "S", "SEP", "S", /* phosphoserine */ "THR", "T", "TPO", "T", /* phosphothreonine */ "VAL", "V", "TRP", "W", "TYR", "Y", "PTR", "Y", /* phosphotyrosine */ "TYS", "Y" /* sulfotyrosine */ }; const int cNRES = (sizeof res) / 8; int rcode[cNRES], rname[cNRES]; /* get integral values for the residue names */ for(a = 0; a < cNRES; a++) { b = 0; for(c = 0; c < 3; c++) { b = (b << 8) | res[a * 2][c]; } rname[a] = b; rcode[a] = res[a * 2 + 1][0]; } for(b = 0; b < n; b++) { found = 0; trg = vla + (b * 3) + 2; for(a = 0; a < cNRES; a++) if(rname[a] == *trg) { found = true; *trg = rcode[a]; break; } if(!found) { // codes for unknown residues are three byte (mask 0xFFFFFF80) *trg = *trg << 8; } } return (ok); } int MatchPreScore(CMatch * I, int *vla1, int n1, int *vla2, int n2, int quiet) { PyMOLGlobals *G = I->G; int a, b; if(!quiet) { PRINTFB(G, FB_Match, FB_Details) " Match: assigning %d x %d pairwise scores.\n", n1, n2 ENDFB(G); } for(a = 0; a < n1; a++) { for(b = 0; b < n2; b++) { // codes for known residues are one byte (mask 0x0000007F) // codes for unknown residues are three byte (mask 0xFFFFFF80) // This allows for exact match of unknown three-letter codes. // Fallback for unknown residues with no exact match is 'X' int code1 = vla1[a * 3 + 2]; int code2 = vla2[b * 3 + 2]; if (code1 & 0xFFFFFF80) { if (code1 == code2) { I->mat[a][b] = 5.F; // (was -1 in previous versions) continue; } code1 = 'X'; } if (code2 & 0xFFFFFF80) { code2 = 'X'; } I->mat[a][b] = I->smat[code1][code2]; } } return 1; } #define BLOSUM62_ROWS 33 #define BLOSUM62_COLS 80 static char blosum62[BLOSUM62_ROWS][BLOSUM62_COLS] = { "# Matrix made by matblas from blosum62.iij\n", "# * column uses minimum score\n", "# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units\n", "# Blocks Database = /data/blocks_5.0/blocks.dat\n", "# Cluster Percentage: >= 62\n", "# Entropy = 0.6979, Expected = -0.5209\n", " A R N D C Q E G H I L K M F P S T W Y V B Z X *\n", "A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4\n", "R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4\n", "N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4\n", "D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4\n", "C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4\n", "Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4\n", "E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4\n", "G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4\n", "H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4\n", "I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4\n", "L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4\n", "K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4\n", "M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4\n", "F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4\n", "P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4\n", "S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4\n", "T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4\n", "W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4\n", "Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4\n", "V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4\n", "B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4\n", "Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4\n", "X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4\n", "* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1\n", "" }; int MatchMatrixFromFile(CMatch * I, const char *fname, int quiet) { PyMOLGlobals *G = I->G; int ok = 1; char *buffer = NULL; char *p; char cc[255]; char *code = NULL; unsigned int x, y; int a; int n_entry; if(fname && fname[0] #ifdef _PYMOL_NOPY /* if Python is absent, then use the hardcoded copy of BLOSUM62 */ && !(strcmp(fname, "BLOSUM62") == 0) #endif ) { buffer = FileGetContents(fname, NULL); if (!buffer) { PRINTFB(G, FB_Match, FB_Errors) " Match-Error: unable to open matrix file '%s'.\n", fname ENDFB(G); ok = false; } } else { buffer = Alloc(char, BLOSUM62_ROWS * BLOSUM62_COLS); if(buffer) { p = buffer; a = 0; while(blosum62[a][0]) { strcpy(p, &blosum62[a][0]); p += strlen(p); a++; } } else { ok = false; } } if(ok && buffer) { /* count codes */ p = buffer; n_entry = 0; while(*p && ok) { switch (*p) { case '#': break; default: if((*p) > 32) n_entry++; break; } p = ParseNextLine(p); } if(!n_entry) ok = false; else { code = (char *) Calloc(char, n_entry * sizeof(int)); /* read codes */ p = buffer; n_entry = 0; while(*p && ok) { switch (*p) { case '#': break; default: if((*p) > 32) { code[n_entry] = *p; n_entry++; } break; } p = ParseNextLine(p); } /* read values */ p = buffer; while((*p) && ok) { switch (*p) { case '#': break; default: if((*p) > 32) { x = *(p++); for(a = 0; a < n_entry; a++) { p = ParseWordCopy(cc, p, 255); y = (unsigned int) code[a]; ok = sscanf(cc, "%f", &I->smat[x][y]); } } break; } if(!ok) break; p = ParseNextLine(p); } } mfree(buffer); } if(ok) { if(!quiet) { PRINTFB(G, FB_Match, FB_Details) " Match: read scoring matrix.\n" ENDFB(G); } } FreeP(code); return (ok); } int MatchAlign(CMatch * I, float gap_penalty, float ext_penalty, int max_gap, int max_skip, int quiet, int window, float ante) { PyMOLGlobals *G = I->G; int a, b, f, g; int nf, ng; int sf, sg; float **score; float **da, **db; int na = I->na, nb = I->nb; unsigned int dim[2]; int2 **point; float mxv; int mxa, mxb; float tst = 0.0; int gap = 0; int *p; int cnt; int ok = true; const float MIN_SCORE = 0.0F; nf = na + 1; ng = nb + 1; da = I->da; db = I->db; if(!quiet) { PRINTFB(G, FB_Match, FB_Actions) " MatchAlign: aligning residues (%d vs %d)...\n", na, nb ENDFB(G); } dim[0] = nf; dim[1] = ng; VLAFreeP(I->pair); score = (float **) UtilArrayCalloc(dim, 2, sizeof(float)); point = (int2 **) UtilArrayCalloc(dim, 2, sizeof(int2)); if(score && point) { /* initialize the scoring matrix */ for(f = 0; f < nf; f++) { for(g = 0; g < ng; g++) { score[f][g] = MIN_SCORE; point[f][g][0] = -1; point[f][g][1] = -1; } } /* now start walking backwards up the alignment */ { int second_pass = false; for(b = nb - 1; b >= 0; b--) { for(a = na - 1; a >= 0; a--) { /* find the maximum scoring cell accessible from this position, * while taking gap penalties into account */ mxv = MIN_SCORE; mxa = -1; mxb = -1; /* search for asymmetric insertions and deletions */ f = a + 1; if((max_gap >= 0) && (second_pass)) { sf = a + 2 + max_gap; sg = b + 2 + max_gap; if(sg > ng) sg = ng; if(sf > nf) sf = nf; } else { sg = ng; sf = nf; } for(g = b + 1; g < sg; g++) { tst = score[f][g]; if(window) { int aa = a, bb = b, ff = f, gg = g, cc; tst += ante; for(cc = 0; cc < window; cc++) { if((ff >= 0) && (gg >= 0) && (ff < na) && (gg < nb)) { tst -= (float) fabs(da[a][ff] - db[b][gg]); aa = ff; bb = gg; ff = point[aa][bb][0]; gg = point[aa][bb][1]; } else break; } } if(!((f == na) || (g == nb))) { gap = g - (b + 1); if(gap) tst += gap_penalty + ext_penalty * (gap - 1); } if(tst > mxv) { mxv = tst; mxa = f; mxb = g; } } g = b + 1; for(f = a + 1; f < sf; f++) { tst = score[f][g]; if(window) { int aa = a, bb = b, ff = f, gg = g, cc; tst += ante; for(cc = 0; cc < window; cc++) { if((ff >= 0) && (gg >= 0) && (ff < na) && (gg < nb)) { tst -= (float) fabs(da[a][ff] - db[b][gg]); aa = ff; bb = gg; ff = point[aa][bb][0]; gg = point[aa][bb][1]; } else break; } } if(!((f == na) || (g == nb))) { gap = (f - (a + 1)); if(gap) tst += gap_penalty + ext_penalty * (gap - 1); } if(tst > mxv) { mxv = tst; mxa = f; mxb = g; } } if(max_skip) { /* search for high scoring mismatched stretches */ sf = a + 1 + max_skip; sg = b + 1 + max_skip; if(sf > nf) sf = nf; if(sg > ng) sg = ng; for(f = a + 1; f < sf; f++) { for(g = b + 1; g < sg; g++) { tst = score[f][g]; if(window) { int aa = a, bb = b, ff = f, gg = g, cc; tst += ante; for(cc = 0; cc < window; cc++) { if((ff >= 0) && (gg >= 0) && (ff < na) && (gg < nb)) { tst -= (float) fabs(da[a][ff] - db[b][gg]); aa = ff; bb = gg; ff = point[aa][bb][0]; gg = point[aa][bb][1]; } else break; } } /* only penalize if we are not at the end */ if(!((f == na) || (g == nb))) { gap = ((f - (a + 1)) + (g - (b + 1))); if(gap > 1) tst += 2 * gap_penalty + ext_penalty * (gap - 2); } } if(tst > mxv) { mxv = tst; mxa = f; mxb = g; } } } /* store what the best next step is */ point[a][b][0] = mxa; point[a][b][1] = mxb; /* and store the cumulative score for this cell */ score[a][b] = mxv + I->mat[a][b]; second_pass = true; } } } if(Feedback(G, FB_Match, FB_Debugging)) { for(b = 0; b < nb; b++) { for(a = 0; a < na; a++) { printf("%4.1f(%2d,%2d)", score[a][b], point[a][b][0], point[a][b][1]); } printf("\n"); } } /* find the best entry point */ mxv = MIN_SCORE; mxa = 0; mxb = 0; for(b = 0; b < nb; b++) { for(a = 0; a < na; a++) { tst = score[a][b]; if(tst > mxv) { mxv = tst; mxa = a; mxb = b; } } } I->pair = VLAlloc(int, 2 * (na > nb ? na : nb)); p = I->pair; a = mxa; b = mxb; cnt = 0; while((a >= 0) && (b >= 0) && (a < na) && (b < nb)) { *(p++) = a; *(p++) = b; f = point[a][b][0]; g = point[a][b][1]; a = f; b = g; cnt++; } PRINTFD(G, FB_Match) " MatchAlign-DEBUG: best entry %8.3f %d %d %d\n", mxv, mxa, mxb, cnt ENDFD; if(!quiet) { PRINTFB(G, FB_Match, FB_Results) " MatchAlign: score %1.3f\n", mxv ENDFB(G); } I->score = mxv; I->n_pair = cnt; VLASize(I->pair, int, (p - I->pair)); FreeP(score); FreeP(point); } return (ok); } void MatchFree(CMatch * I) { FreeP(I->da); FreeP(I->db); FreeP(I->mat); FreeP(I->smat); VLAFreeP(I->pair); OOFreeP(I); }