src/core/parsing.cpp
00001 #include <cmath>
00002 #include "parsing.h"
00003 
00004 namespace BioLCCC
00005 {
00006 ParsingException::ParsingException(std::string message):
00007         BioLCCCException(message) 
00008 {
00009 };
00010 
00011 std::vector<double> calculateMonomerEnergyProfile(
00012     const std::vector<ChemicalGroup> &parsedSequence,
00013     const ChemicalBasis & chemBasis,
00014     const double secondSolventConcentration,
00015     const double columnRelativeStrength, 
00016     const double temperature) throw (BioLCCCException)
00017 {
00018     if (parsedSequence.size() < 3)
00019     {
00020         throw BioLCCCException(
00021             "The parsed sequence contains too little chemical groups.");
00022     }
00023 
00024     if (columnRelativeStrength == 0.0)
00025     {
00026         return std::vector<double> (parsedSequence.size()-2, 0.0);
00027     }
00028 
00029     // Due to the preliminary scaling the binding energy of water equals zero.
00030     double Q = exp(columnRelativeStrength * 
00031                    (chemBasis.secondSolventBindEnergy() - 0.0) *
00032                    293.0 / temperature);
00033     // Nb = (DensityB * %B / MB) / 
00034     //      (DensityB * %B / MB + DensityA * %A / MA)
00035     // where DensityA and DensityB are the corresponding densities and
00036     // MA and MB are the corresponding molecular weights.
00037     double Nb =
00038         secondSolventConcentration * chemBasis.secondSolventDensity()
00039         / chemBasis.secondSolventAverageMass() 
00040         / ( secondSolventConcentration * chemBasis.secondSolventDensity() 
00041                 / chemBasis.secondSolventAverageMass()
00042             + (100.0 - secondSolventConcentration) 
00043               * chemBasis.firstSolventDensity()
00044               / chemBasis.firstSolventAverageMass());
00045 
00046     double Eab = 0.0;
00047     if (chemBasis.snyderApproximation()) 
00048     {
00049         Eab = Nb * chemBasis.secondSolventBindEnergy();
00050     }
00051     else
00052     {
00053         Eab = 0.0 + 1.0 / columnRelativeStrength * log( 1.0 - Nb + Nb * Q );
00054     }
00055 
00056     std::vector<double> monomerEnergyProfile;
00057     for (std::vector<BioLCCC::ChemicalGroup>::const_iterator residue =
00058                 ++(parsedSequence.begin());
00059             residue != --(parsedSequence.end());
00060             ++residue)
00061     {
00062         double residueEnergy = residue->bindEnergy();
00063         double residueArea = residue->bindArea();
00064 
00065         // Adding the energy of the N-terminal group to the first residue.
00066         if (residue == ++(parsedSequence.begin()))
00067         {
00068             residueEnergy += parsedSequence.begin()->bindEnergy();
00069             residueArea += parsedSequence.begin()->bindArea();
00070         }
00071 
00072         // Adding the energy of the C-terminal group to the last residue.
00073         else if (residue == --(--(parsedSequence.end())))
00074         {
00075             residueEnergy += (--(parsedSequence.end()))->bindEnergy();
00076             residueArea += (--(parsedSequence.end()))->bindArea();
00077         }
00078 
00079         monomerEnergyProfile.push_back(
00080             columnRelativeStrength * 
00081                 //(residueEnergy - Eab) * 293.0 / temperature);
00082                 (residueEnergy - residueArea * Eab) * 293.0 / temperature);
00083     }
00084     return monomerEnergyProfile;
00085 }
00086 
00087 std::vector<double> calculateSegmentEnergyProfile(
00088     const std::vector<double> &monomerEnergyProfile,
00089     const double monomerLength,
00090     const double kuhnLength)
00091 {
00092     std::vector<double> segmentEnergyProfile; 
00093     std::vector<double>::const_iterator monomerEnergyIt =
00094         monomerEnergyProfile.begin();
00095     double kuhnLeftBorder  = 0;
00096     double monomerLeftBorder  = 0;
00097     double sumEnergy = 0.0;
00098     bool kuhnSegmentOpen = true;
00099 
00100     while (monomerEnergyIt != monomerEnergyProfile.end()) 
00101     {
00102         if ((kuhnLeftBorder + kuhnLength) >= 
00103                 (monomerLeftBorder + monomerLength))
00104         {
00105             sumEnergy += *(monomerEnergyIt) * 
00106                 (monomerLeftBorder + monomerLength - 
00107                     std::max(monomerLeftBorder, kuhnLeftBorder)) / 
00108                 monomerLength;
00109             kuhnSegmentOpen = true;
00110             monomerLeftBorder += monomerLength;
00111             ++monomerEnergyIt;
00112         }
00113         else 
00114         {
00115             sumEnergy += *(monomerEnergyIt) * 
00116                 (kuhnLeftBorder + kuhnLength - 
00117                     std::max(monomerLeftBorder, kuhnLeftBorder)) / 
00118                 monomerLength;
00119             segmentEnergyProfile.push_back(sumEnergy);
00120             sumEnergy = 0.0;
00121             kuhnSegmentOpen = false;
00122             kuhnLeftBorder += kuhnLength;
00123         }
00124     }
00125 
00126     if (kuhnSegmentOpen)
00127     {
00128         segmentEnergyProfile.push_back(sumEnergy);
00129     }
00130 
00131     return segmentEnergyProfile;
00132 }
00133 
00134 std::vector<ChemicalGroup> parseSequence(
00135     const std::string &source,
00136     const ChemicalBasis &chemBasis) throw(BioLCCCException)
00137 {
00138     std::vector<ChemicalGroup> parsedSequence;
00139     ChemicalGroup NTerminus;
00140     ChemicalGroup CTerminus;
00141 
00142     // At first we need to strip the sequence from adjacent amino acids.
00143     // If a source sequence contains them, it should contain two dots, so we
00144     // need the part of sequence between them.
00145     std::size_t firstDotPosition = 0;
00146     std::size_t secondDotPosition = 0;
00147 
00148     // We'll use this variable to contain peptide sequence without adjacent
00149     // amino acids.
00150     std::string strippedSource = source;
00151 
00152     firstDotPosition = source.find(".");
00153 
00154     if (firstDotPosition != std::string::npos)
00155     {
00156         secondDotPosition = source.find(".", firstDotPosition+1);
00157         if (secondDotPosition != std::string::npos)
00158         {
00159 
00160             // If a source sequence contains more that two dots, it's broken.
00161             if (source.find(".", secondDotPosition+1) != std::string::npos)
00162             {
00163                 throw ParsingException(
00164                     "The sequence " + source +" contains more than two dots.");
00165             }
00166             else
00167             {
00168                 strippedSource = source.substr(firstDotPosition+1,
00169                     secondDotPosition - firstDotPosition - 1);
00170             }
00171         }
00172         // If a source sequence contains only one dot, it's broken.
00173         else
00174         {
00175             throw ParsingException(
00176                 "The sequence " + source + " contains only one dot.");
00177         }
00178     }
00179 
00180     // Than goes parsing.
00181     std::size_t NTerminusPosition = 0;
00182 
00183     // First we need to check the stripped source sequence for
00184     // the N-Terminal group.
00185     NTerminus = chemBasis.defaultNTerminus();
00186     for (std::map<std::string,ChemicalGroup>::const_iterator
00187             NTerminusIterator = chemBasis.chemicalGroups().begin();
00188             NTerminusIterator != chemBasis.chemicalGroups().end();
00189             NTerminusIterator++)
00190     {
00191         if (NTerminusIterator->second.isNTerminal())
00192         {
00193             if (strippedSource.find(NTerminusIterator->second.label()) ==
00194                     (size_t)0)
00195             {
00196                 NTerminus = NTerminusIterator->second;
00197                 NTerminusPosition = NTerminusIterator->second.label().size();
00198             }
00199         }
00200     }
00201 
00202     // Then we need to found the location of the C-Terminus.
00203     CTerminus = chemBasis.defaultCTerminus();
00204     std::size_t CTerminusPosition;
00205     CTerminusPosition = strippedSource.find("-", NTerminusPosition);
00206     if (CTerminusPosition != std::string::npos)
00207     {
00208         // The sequence should not contain hyphens after C-terminal group.
00209         if (strippedSource.find("-", CTerminusPosition+1) != std::string::npos)
00210         {
00211             throw ParsingException(
00212                 "The sequence " + source +
00213                 " contains hyphen after C-terminal group.");
00214         }
00215 
00216         // Searching for known C-terminal groups.
00217         bool CTerminusFound = false;
00218         for (std::map<std::string,ChemicalGroup>::const_iterator
00219                 CTerminusIterator = chemBasis.chemicalGroups().begin();
00220                 CTerminusIterator != chemBasis.chemicalGroups().end();
00221                 CTerminusIterator++)
00222         {
00223             if (CTerminusIterator->second.isCTerminal())
00224             {
00225                 if (strippedSource.find(CTerminusIterator->second.label(),
00226                     CTerminusPosition) != std::string::npos)
00227                 {
00228                     CTerminus = CTerminusIterator->second;
00229                     CTerminusFound = true;
00230                 }
00231             }
00232         }
00233         if (!CTerminusFound)
00234         {
00235             throw ParsingException(
00236                 "The sequence " + source +
00237                 " contains unknown C-terminal group\"" + 
00238                 source.substr(CTerminusPosition) + "\".");
00239         }
00240     }
00241     else
00242     {
00243         CTerminusPosition = strippedSource.size();
00244     }
00245 
00246     // At this step we obtain the sequence of a peptide without adjacent
00247     // amino acids and terminal groups.
00248     strippedSource = strippedSource.substr(
00249         NTerminusPosition, CTerminusPosition-NTerminusPosition);
00250 
00251     // We need to check whether it contains any non-letter characters.
00252     for (std::size_t i=0; i<strippedSource.size(); i++)
00253     {
00254         if (!(((int(strippedSource[i]) >= int('a')) &&
00255                 (int(strippedSource[i]) <= int('z'))) ||
00256                 ((int(strippedSource[i]) >= int('A')) &&
00257                  (int(strippedSource[i]) <= int('Z')))))
00258         {
00259             throw ParsingException(
00260                 "The sequence " + source +
00261                 " contains a non-letter character.");
00262         }
00263     }
00264 
00265     // Then we divide the whole sequence into aminoacids.
00266     bool aminoAcidFound;
00267     size_t curPos = 0;
00268     while (curPos < strippedSource.size())
00269     {
00270         aminoAcidFound = false;
00271         for (std::map<std::string,ChemicalGroup>::const_iterator
00272                 aminoAcidIterator = chemBasis.chemicalGroups().begin();
00273                 aminoAcidIterator != chemBasis.chemicalGroups().end();
00274                 aminoAcidIterator++)
00275         {
00276             if (strippedSource.compare(curPos,
00277                 aminoAcidIterator->second.label().size(),
00278                 aminoAcidIterator->second.label()) == 0)
00279             {
00280                 curPos += aminoAcidIterator->second.label().size();
00281                 parsedSequence.push_back(aminoAcidIterator->second);
00282                 aminoAcidFound = true;
00283                 break;
00284             }
00285         }
00286 
00287         if (!aminoAcidFound)
00288         {
00289             throw ParsingException(
00290                 "The sequence " + source + " contains unknown amino acid \"" + 
00291                 source.substr(curPos, 1) + "\".");
00292         }
00293     }
00294     parsedSequence.insert(parsedSequence.begin(), NTerminus);
00295     parsedSequence.push_back(CTerminus);
00296     return parsedSequence;
00297 }
00298 }
00299 
 All Classes Namespaces Functions Variables Enumerations Enumerator