#include <iostream>
#include <fstream>
#include <vector>
#include <string>
// MOWSE matrix computation
// Copyright (C) 2005, 2006 Jacques Colinge
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
// Contact:
// Dr. Jacques Colinge
// CeMM
// Lazarettgasse 19/3
// A-1090 Vienna, Austria
// http://www.cemm.at
// Computation of the MOWSE matrix from a database of protein sequences in the fasta format. Compiled with g++ and tested
// under Linux.
using namespace std;
class PeptidePosition
{
public:
PeptidePosition():first(-1),last(-1),length(-1),nmc(0) {}
PeptidePosition(int f, int l, int len, int n){
first = f;
last = l;
length = len;
nmc = n;
}
~PeptidePosition(){}
int first, last, length, nmc;
}; // class PeptidePosition
// Global variables to store basic masses
vector<double> aaMasses;
double waterMass, protonMass;
void readAAMasses(vector<double>& aaMasses, double& water, double& proton)
{
aaMasses.assign('Z', -1.0e10);
aaMasses['A'] = 71.03711;
aaMasses['R'] = 156.10111;
aaMasses['N'] = 114.04293;
aaMasses['D'] = 115.02694;
aaMasses['C'] = 103.00919;
aaMasses['E'] = 129.04259;
aaMasses['Q'] = 128.05858;
aaMasses['G'] = 57.02146;
aaMasses['H'] = 137.05891;
aaMasses['I'] = 113.08406;
aaMasses['L'] = 113.08406;
aaMasses['K'] = 128.09496;
aaMasses['M'] = 131.04049;
aaMasses['F'] = 147.06841;
aaMasses['P'] = 97.05276;
aaMasses['S'] = 87.03203;
aaMasses['T'] = 101.04768;
aaMasses['W'] = 186.07931;
aaMasses['Y'] = 163.06333;
aaMasses['V'] = 99.06841;
aaMasses['C'] += 57.02146; // iodoacetamide
water = 18.01056;
proton = 1.00728;
} // readAAMasses
double getPeptideMass(const string& protein, const PeptidePosition& pos)
{
// Computes a peptide mass
double mass = waterMass;
for (int i = pos.first; i <= pos.last; i++){
if ((protein[i] == 'B') || (protein[i] == 'X') || (protein[i] == 'Z'))
return 0.0;
mass += aaMasses[protein[i]];
}
return mass;
} // getPeptideMass
double getPeptideMass(const string& peptide)
{
// Computes a peptide mass
double mass = waterMass;
for (int i = 0; i < peptide.length(); i++){
if ((peptide[i] == 'B') || (peptide[i] == 'X') || (peptide[i] == 'Z'))
return 0.0;
mass += aaMasses[peptide[i]];
}
return mass;
} // getPeptideMass
void digestByTrypsin(const string& protein, int nmc, vector<PeptidePosition>& pept)
{
// Digestion by trypsin with a maximum of nmc missed cleavages in a peptide. The
// peptides are returned as a list of sequence position in the protein variable.
// Forces size=0 in case previous positions were still in pept
pept.resize(0);
// Computes the position of peptides without missed cleavage
int previous = 0;
for (int i = 0; i < protein.length()-1; i++)
if (((protein[i] == 'K') || (protein[i] == 'R')) && (protein[i+1] != 'P')){
pept.push_back(PeptidePosition(previous, i, i-previous+1, 0));
previous = i+1;
}
pept.push_back(PeptidePosition(previous, protein.length()-1, protein.length()-previous, 0));
// Computes the position of peptides with up to nmc missed cleavages
int numPept = pept.size();
for (int i = 0; i < numPept-1; i++)
for (int j = 1; (j <= nmc) && (i+j < numPept); j++)
pept.push_back(PeptidePosition(pept[i].first, pept[i+j].last, pept[i+j].last-pept[i].first+1, j));
} // digestByTrypsin
void computeMatrix(vector<vector<unsigned> >& M, const string fasta, int nmc)
{
// Digests all the proteins of a fasta file and collect the frequencies of peptide masses.
vector<PeptidePosition> pept;
M.resize(40);
for (unsigned i = 0; i < M.size(); i++)
M[i].assign(30, 0);
// Processes the entire file of protein sequences
string line, protein;
ifstream fastaFile(fasta.c_str());
getline(fastaFile, line); // Skips first fasta header
while (fastaFile.good()){
protein = "";
do{
getline(fastaFile, line);
if (line[0] == '>')
break;
else
protein += line;
} while (fastaFile.good());
double proteinMass = getPeptideMass(protein, PeptidePosition(0, protein.length()-1, protein.length(), 0));
digestByTrypsin(protein, nmc, pept);
unsigned j = static_cast<unsigned>(proteinMass/10000.0);
for (unsigned k = 0; k < pept.size(); k++){
double mass = getPeptideMass(protein, pept[k])+protonMass;
unsigned i = static_cast<unsigned>(mass/100.0);
if (i >= M.size())
i = M.size()-1;
if (j >= M[0].size())
j = M[0].size()-1;
// Counts
M[i][j]++;
}
}
// Finds the max counts per protein mass range
vector<unsigned> maxCount;
for (unsigned j = 0; j < M[0].size(); j++){
unsigned max = M[0][j];
for (unsigned k = 0; k < M.size(); k++)
if (M[k][j] > max)
max = M[k][j];
maxCount.push_back(max);
}
// Prints the MOWSE matrix on standard output
cout << M.size() << "\t" << M[0].size() << endl;
for (unsigned i = 0; i < M.size(); i++){
for (unsigned j = 0; j < M[0].size(); j++){
if (j > 0)
cout << "\t";
cout << static_cast<double>(M[i][j])/maxCount[j];
}
cout << endl;
}
} // computeMatrix
int main(int argc, char* argv[])
{
string fasta;
int nmc=1;
readAAMasses(aaMasses, waterMass, protonMass);
// Scans the command line
for (int i = 1; i < argc; i++)
{
if (strcmp(argv[i], "-help") == 0){
cerr << "Usage: computeMOWSEMatrix -fasta <filename> -nmc <int>\n";
exit(0);
}
else if ((strcmp(argv[i], "-fasta") == 0) && (i < argc-1))
fasta = string(argv[++i]);
else if ((strcmp(argv[i], "-nmc") == 0) && (i < argc-1))
nmc = atoi(argv[++i]);
}
// Digests everything
vector<vector<unsigned> > M;
computeMatrix(M, fasta, nmc);
return 0;
} // main