#include #include #include #include #include #include // Primitive PMF search engine // Copyright (C) 2005, 2006 Jacques Colinge // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. // Contact: // Dr. Jacques Colinge // CeMM // Lazarettgasse 19/3 // A-1090 Vienna, Austria // http://www.cemm.at // Simple C++ code to implement the PMF DB search algorithm. Some elementary classes are defined to // collect and organize data. No real OOP! Compiled with g++ and tested under Linux. // No variable modifications are possible, cysteines are modified. Tryptic digestion only. // Exercises: (1) Implements Profound (Zhang & Chait, Anal Chem, 2000; Zhang, et al., International Patent Application WO 00/73787 A1), RSA (Egelhofer, et al., Anal Chem, 2002), or OLAV-PMF (Magnin, et al., J Proteome Res, 2004) scores; // (2) Improve the p-value estimation; // (3) Implement variable oxidation on methionines; // (4) Implement a simple user interface (web-based). using namespace std; int verbose = 0; enum {noScore=-300, nBest=10}; static const double reductionFactor = 0.25; // Class used to represent peptides after digestion. class PeptidePosition { public: PeptidePosition():first(0),last(0),length(0),nmc(0) {} PeptidePosition(unsigned f, unsigned l, unsigned len, unsigned n){ first = f; last = l; length = len; nmc = n; } ~PeptidePosition(){} unsigned first, last, length, nmc; }; // class PeptidePosition // Class used to represent one peak, either experimental or theoretical class Peak { public: Peak():mass(0.0),intensity(0.0) {} Peak(double m, double i){ mass = m; intensity = i; } ~Peak(){} double mass, intensity; }; // class Peak bool operator<(const Peak& a, const Peak& b) { return a.mass < b.mass; } // operator<(const Peak&, const Peak&) // Class used to represent the match between one experimental peak and one theoretical peak class MatchedPeaks { public: MatchedPeaks(){ theoPeak = Peak(); expPeak = Peak(); } MatchedPeaks(const Peak& t, const Peak& e){ theoPeak = t; expPeak = e; } ~MatchedPeaks(){} Peak theoPeak, expPeak; }; // class MatchedPeaks // Class used to represent the match of the experimental spectrum with one database entry class DBMatch { public: DBMatch(){ ac = id = ""; score = proteinMass = 0.0; pValue = 1.0; nbPept = nbTheoPept = 0; } DBMatch(string a, string i, double s, unsigned n, double pm, unsigned nt){ ac = a; score = s; pValue = 1.0; id = i; nbPept = n; proteinMass = pm; nbTheoPept = nt; } ~DBMatch(){} double score, pValue, proteinMass; string ac, id; unsigned nbPept, nbTheoPept; }; // class DBMatch bool operator<(const DBMatch& a, const DBMatch& b) { return a.score < b.score; } // operator<(const DBMatch&, const DBMatch&) // Global variables to store basic masses and the MOWSE matrix vector aaMasses; double waterMass, protonMass; double minMass=850, maxMass=3500, delta=50; vector > M; // Reads a MOWSE matrix computed by the program computeMOWSEMatrix void readMOWSEMatrix(const string fName, vector >& M, const string& scoring) { if (scoring == "spc") return; ifstream mfile(fName.c_str()); unsigned m, n; mfile >> m >> n; M.resize(m); for (unsigned i = 0; i < m; i++) M[i].resize(n); for (unsigned i = 0; i < M.size(); i++) for (unsigned j = 0; j < M[0].size(); j++) mfile >> M[i][j]; // Checks for 0-values for (unsigned j = 0; j < M[0].size(); j++){ double min = 1.0; bool zeros = false; for (unsigned i = 0; i < M.size(); i++) if (M[i][j] == 0.0) zeros = true; else if (M[i][j] < min) min = M[i][j]; for (unsigned i = 0; i < M.size(); i++) if (M[i][j] == 0.0) M[i][j] = min; } if (scoring == "pmowse"){ // Converts into probabilities for (unsigned j = 0; j < M[0].size(); j++){ double sum = 0.0; for (unsigned i = 0; i < M.size(); i++) sum += M[i][j]; for (unsigned i = 0; i < M.size(); i++) M[i][j] = log(M[i][j]/sum*M.size()); // log-odds with an equiprobable null-model } } } // readMOWSEMatrix void readAAMasses(vector& aaMasses, double& water, double& proton) { aaMasses.assign('Z', -1.0e10); aaMasses['A'] = 71.03711; aaMasses['R'] = 156.10111; aaMasses['N'] = 114.04293; aaMasses['D'] = 115.02694; aaMasses['C'] = 103.00919; aaMasses['E'] = 129.04259; aaMasses['Q'] = 128.05858; aaMasses['G'] = 57.02146; aaMasses['H'] = 137.05891; aaMasses['I'] = 113.08406; aaMasses['L'] = 113.08406; aaMasses['K'] = 128.09496; aaMasses['M'] = 131.04049; aaMasses['F'] = 147.06841; aaMasses['P'] = 97.05276; aaMasses['S'] = 87.03203; aaMasses['T'] = 101.04768; aaMasses['W'] = 186.07931; aaMasses['Y'] = 163.06333; aaMasses['V'] = 99.06841; aaMasses['C'] += 57.02146; // iodoacetamide water = 18.01056; proton = 1.00728; } // readAAMasses void readPkl(const string pkl, vector& expSpectrum) { ifstream pkfile(pkl.c_str()); string line; getline(pkfile, line); while (pkfile.good()){ int v = line.find(','); double m = atof(line.substr(0,v).c_str()); double i = atof(line.substr(v+1, line.find(',',v+1)-v).c_str()); expSpectrum.push_back(Peak(m, i)); getline(pkfile, line); } // We assume the experimental masses to be sorted. // Otherwise: sort(expSpectrum.begin(), expSpectrum.end()); } // readPkl double getPeptideMass(const string& protein, const PeptidePosition& pos) { // Computes a peptide mass double mass = waterMass; for (unsigned i = pos.first; i <= pos.last; i++) mass += aaMasses[protein[i]]; return mass; } // getPeptideMass double getPeptideMass(const string& peptide) { // Computes a peptide mass double mass = waterMass; for (unsigned i = 0; i < peptide.length(); i++) mass += aaMasses[peptide[i]]; return mass; } // getPeptideMass double getProteinMass(const string& peptide) { // Computes a protein mass, accepts B, Z, X a.a. double mass = waterMass; for (unsigned i = 0; i < peptide.length(); i++) if ((peptide[i] == 'B') || (peptide[i] == 'X') || (peptide[i] == 'Z')) mass += 100.0; // average mass else mass += aaMasses[peptide[i]]; return mass; } // getProteinMass void digestByTrypsin(const string& protein, unsigned nmc, vector& pept) { // Digestion by trypsin with a maximum of nmc missed cleavages in a peptide. The // peptides are returned as a list of sequence position in the protein variable. // Forces size=0 in case previous positions were still in pept pept.resize(0); // Computes the position of peptides without missed cleavage unsigned previous = 0; for (unsigned i = 0; i < protein.length()-1; i++) if (((protein[i] == 'K') || (protein[i] == 'R')) && (protein[i+1] != 'P')){ pept.push_back(PeptidePosition(previous, i, i-previous+1, 0)); previous = i+1; } pept.push_back(PeptidePosition(previous, protein.length()-1, protein.length()-previous, 0)); // Computes the position of peptides with up to nmc missed cleavages unsigned numPept = pept.size(); for (unsigned i = 0; i < numPept-1; i++) for (unsigned j = 1; (j <= nmc) && (i+j < numPept); j++) pept.push_back(PeptidePosition(pept[i].first, pept[i+j].last, pept[i+j].last-pept[i].first+1, j)); } // digestByTrypsin void computeMatch(const double tol, const string& protein, const vector& pept, const vector& expSpectrum, vector& matched, const double shift) { // Finds the experimental masses that fall within tol ppm to a theoretical mass matched.resize(0); for (unsigned i = 0; i < pept.size(); i++){ double theoMass = getPeptideMass(protein, pept[i])+protonMass+shift; if ((theoMass >= minMass) && (theoMass <= maxMass)){ // Dichotomic search for the closest experimental mass int left = 0; int right = expSpectrum.size()-1; int middle; while (right >= left){ middle = (right+left)/2; if (expSpectrum[middle].mass >= theoMass) right = middle-1; if (expSpectrum[middle].mass <= theoMass) left = middle+1; } int closest; if (left-right == 2) // Found the exact mass closest = middle; else if (right == -1) // Search for a mass smaller than the smallest in expSpectrum, return the smallest closest = 0; else if (left == int(expSpectrum.size())) // Search for a mass that is larger than the largest mass in expSpectrum, return the largest closest = expSpectrum.size()-1; else // Normal case closest = (expSpectrum[left].mass-theoMass < theoMass-expSpectrum[right].mass) ? left : right; double expMass = expSpectrum[closest].mass; if (fabs(theoMass-expMass)/(theoMass+expMass)*2.0e6 <= tol) // The closest experimental mass falls within tolerance tol, keep it matched.push_back(MatchedPeaks(Peak(theoMass, 1.0), expSpectrum[closest])); } } } // computeMatch void matchMass(bool recalibration, const string& protein, const vector& pept, const vector& expSpectrum, vector& matched) { // Finds the experimental masses that fall with delta ppm of a theoretical mass computeMatch(delta, protein, pept, expSpectrum, matched, 0.0); if (recalibration && matched.size()){ // Determines a mass shift double shift = 0.0; for (unsigned i = 0; i < matched.size(); i++) shift += matched[i].expPeak.mass-matched[i].theoPeak.mass; shift /= matched.size(); computeMatch(reductionFactor*delta, protein, pept, expSpectrum, matched, shift); } } // matchMass double computeSPCScore(const string& protein, const vector& pept, const vector& expSpectrum, vector& matched) { return static_cast(matched.size()); } // computeSPCScore double computeMOWSEScore(const string& protein, const vector& pept, const vector& expSpectrum, vector& matched) { if (matched.size() == 0) return noScore; double proteinMass = getProteinMass(protein); double score = 50000.0/proteinMass; unsigned j = static_cast(proteinMass/10000.0); if (j >= M[0].size()) j = M[0].size()-1; for (unsigned k = 0; k < matched.size(); k++){ unsigned i = static_cast(matched[k].theoPeak.mass/100); if (i >= M.size()) i = M.size()-1; score /= M[i][j]; } if (score <= 1.0e-10) return noScore; else return log(score); } // computeMOWSEScore double computeProbMOWSEScore(const string& protein, const vector& pept, const vector& expSpectrum, vector& matched) { if (matched.size() == 0) return noScore; double proteinMass = getProteinMass(protein); double score = log(50000.0/proteinMass); unsigned j = static_cast(proteinMass/10000.0); if (j >= M[0].size()) j = M[0].size()-1; for (unsigned k = 0; k < matched.size(); k++){ unsigned i = static_cast(matched[k].theoPeak.mass/100); if (i >= M.size()) i = M.size()-1; score += M[i][j]; } return score; } // computeProbMOWSEScore void searchFastaFile(vector& dbMatch, const string fasta, unsigned nmc, const vector& expSpectrum, const string& scoring, bool recalibration) { vector pept; vector matched; // Processes the entire file of protein sequences, WE ASSUME >AC ID ... as format string line, protein, ac, id; ifstream fastaFile(fasta.c_str()); getline(fastaFile, line); int v = line.find(' '); ac = line.substr(1, v-1); id = line.substr(v+5, line.find(' ', v+5)-v-5); while (fastaFile.good()){ protein = ""; do{ getline(fastaFile, line); if (line[0] == '>') break; else protein += line; } while (fastaFile.good()); // Digests and computes the score digestByTrypsin(protein, nmc, pept); matchMass(recalibration, protein, pept, expSpectrum, matched); double score; if (scoring == "mowse") score = computeMOWSEScore(protein, pept, expSpectrum, matched); else if (scoring == "pmowse") score = computeProbMOWSEScore(protein, pept, expSpectrum, matched); else if (scoring == "spc") score = computeSPCScore(protein, pept, expSpectrum, matched); else return; dbMatch.push_back(DBMatch(ac, id, score, matched.size(), getProteinMass(protein), pept.size())); if (fastaFile.good()){ v = line.find(' '); ac = line.substr(1, v-1); id = line.substr(v+5, line.find(' ', v+5)-v-5); } } } // searchFastaFile double gaussianTail(const double z) { // Based upon algorithm 5666 for the error function, from: // Hart, J.F. et al, 'Computer Approximations', Wiley 1968 static double p[7] = {220.2068679123761, 221.2135961699311, 112.0792914978709, 33.91286607838300, 6.373962203531650, 0.7003830644436881, 0.03526249659989109}; static double q[8] = {440.4137358247522, 793.8265125199484, 637.3336333788311, 296.5642487796737, 86.78073220294608, 16.06417757920695, 1.755667163182642, 0.08838834764831844}; if(z > 37.) return 0.0; if(z < -37) return 1.0; double cutOff = 7.7071; double root2Pi = 2.506628274631001; double zabs = fabs(z); double expntl = exp(-.5*zabs*zabs); double pdf = expntl/root2Pi; double tail; if(zabs < cutOff) tail = expntl*((((((p[6]*zabs + p[5])*zabs + p[4])*zabs + p[3])*zabs + p[2])*zabs + p[1])*zabs + p[0])/(((((((q[7]*zabs + q[6])*zabs + q[5])*zabs + q[4])*zabs + q[3])*zabs + q[2])*zabs + q[1])*zabs + q[0]); else tail = pdf/(zabs + 1./(zabs + 2./(zabs + 3./(zabs + 4./(zabs + 0.65))))); return (z > 0.0) ? tail : 1.0-tail; } // gaussianTail void outputResults(vector& dbMatch, const string& scoring) { // Finds the best results sort(dbMatch.begin(), dbMatch.end()); double maxScore = dbMatch.back().score; // Estimates p-values by fitting a distribution with all but the nBest best scores if ((scoring == "mowse") || (scoring == "pmowse")){ // Assumes a Gaussian distribution of the random scores double mean = 0.0, xx = 0.0; unsigned count = 0; for (unsigned i = 0; i < dbMatch.size()-nBest; i++) if (dbMatch[i].score != noScore){ register double v = dbMatch[i].score; mean += v; xx += v*v; count++; } mean /= static_cast(count); xx /= static_cast(count); if (xx > mean*mean){ double sd = sqrt(xx-mean*mean); int num = 0; double lastScore = maxScore; for (int i = dbMatch.size()-1; (i >= 0) && ((num < nBest) || (dbMatch[i].score == lastScore)); i--) if (dbMatch[i].score != noScore){ dbMatch[i].pValue = gaussianTail((dbMatch[i].score-mean)/sd); num++; lastScore = dbMatch[i].score; } } } else if (scoring == "spc"){ // Assume an exponential distribution of the random scores double mean = 0.0; for (unsigned i = 0; i < dbMatch.size()-nBest; i++) mean += dbMatch[i].score; mean /= dbMatch.size(); double lambda = 1.0/mean; int num = 0; for (int i = dbMatch.size()-1; (i >= 0) && ((i >= dbMatch.size()-nBest) || (dbMatch[i].score == dbMatch[dbMatch.size()-nBest].score)); i--, num++) dbMatch[i].pValue = exp(-lambda*dbMatch[i].score); } // Prints the nBest first sequences int num = 0; double lastScore = maxScore; for (int i = dbMatch.size()-1; (i >= 0) && ((num < nBest) || (dbMatch[i].score == lastScore)); i--) if (dbMatch[i].score != noScore){ cout << num+1 << ": " << dbMatch[i].ac << "\t" << dbMatch[i].id << "\t" << dbMatch[i].score << "\t" << dbMatch[i].nbPept << "\t" << dbMatch[i].pValue << "\t" << dbMatch[i].proteinMass << "\t" << dbMatch[i].nbTheoPept << endl; lastScore = dbMatch[i].score; num++; } if (verbose) // Outputs the "random" score empirical distribution for (unsigned i = 0; i < dbMatch.size(); i++) if (dbMatch[i].score != noScore) cerr << i+1 << "\t" << dbMatch[i].score << endl; } // outputResults void usage() { cerr << "Usage: pmfDBSearch [options] -fasta -pkl \n\nOptions are:\n"; cerr << "-help\n"; cerr << "-verbose prints more details about the computation\n"; cerr << "-fasta a protein sequence sequence database in the fasta format\n"; cerr << "-pkl experimental PMF data in the pkl format\n"; cerr << "-nmc maximum number of missed cleavages, default is 1\n"; cerr << "-minmass minimum mass considered in Daltons, default is 850\n"; cerr << "-maxmass maximum mass considered in Daltons, default 3500\n"; cerr << "-delta mass tolerance in ppm, default is 50\n"; cerr << "-scoring scoring function name (mowse|pmowse|spc)\n"; cerr << "-recalibration triggers a posteriori recalibration\n\n"; exit(0); } // usage int main(int argc, char* argv[]) { string fasta, pkl; unsigned nmc=1; string scoring = "mowse"; bool recalibration = false; readAAMasses(aaMasses, waterMass, protonMass); // Scans the command line for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "-help") == 0) usage(); else if ((strcmp(argv[i], "-fasta") == 0) && (i < argc-1)) fasta = string(argv[++i]); else if ((strcmp(argv[i], "-pkl") == 0) && (i < argc-1)) pkl = string(argv[++i]); else if ((strcmp(argv[i], "-nmc") == 0) && (i < argc-1)) nmc = atoi(argv[++i]) >= 0 ? atoi(argv[i]) : 0; else if ((strcmp(argv[i], "-minmass") == 0) && (i < argc-1)) minMass = atof(argv[++i]); else if ((strcmp(argv[i], "-maxmass") == 0) && (i < argc-1)) maxMass = atof(argv[++i]); else if ((strcmp(argv[i], "-delta") == 0) && (i < argc-1)) delta = atof(argv[++i]); else if ((strcmp(argv[i], "-scoring") == 0) && (i < argc-1)) scoring = string(argv[++i]); else if (strcmp(argv[i], "-verbose") == 0) verbose = 1; else if (strcmp(argv[i], "-recalibration") == 0) recalibration = true; } // Reads experimental data vector expSpectrum; readPkl(pkl, expSpectrum); // Searches the DB readMOWSEMatrix("MOWSE.txt", M, scoring); vector dbMatch; searchFastaFile(dbMatch, fasta, nmc, expSpectrum, scoring, recalibration); // Print results outputResults(dbMatch, scoring); return 0; } // main