GCC Code Coverage Report


source/XpertMassCore/src/
File: source/XpertMassCore/src/MonomerDictionary.cpp
Date: 2025-11-20 01:41:33
Lines:
0/121
0.0%
Functions:
0/11
0.0%
Branches:
0/150
0.0%

Line Branch Exec Source
1 /* BEGIN software license
2 *
3 * MsXpertSuite - mass spectrometry software suite
4 * -----------------------------------------------
5 * Copyright(C) 2009,...,2018 Filippo Rusconi
6 *
7 * http://www.msxpertsuite.org
8 *
9 * This file is part of the MsXpertSuite project.
10 *
11 * The MsXpertSuite project is the successor of the massXpert project. This
12 * project now includes various independent modules:
13 *
14 * - massXpert, model polymer chemistries and simulate mass spectrometric data;
15 * - mineXpert, a powerful TIC chromatogram/mass spectrum viewer/miner;
16 *
17 * This program is free software: you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation, either version 3 of the License, or
20 * (at your option) any later version.
21 *
22 * This program is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details.
26 *
27 * You should have received a copy of the GNU General Public License
28 * along with this program. If not, see <http://www.gnu.org/licenses/>.
29 *
30 * END software license
31 */
32
33
34 /////////////////////// Qt includes
35 #include <QObject>
36 #include <QFile>
37
38
39 /////////////////////// Local includes
40 #include "MsXpS/libXpertMassCore/MonomerDictionary.hpp"
41 #include "MsXpS/libXpertMassCore/Sequence.hpp"
42
43 namespace MsXpS
44 {
45 namespace libXpertMassCore
46 {
47
48 /*!
49 \class MsXpS::libXpertMassCore::MonomerDictionary
50 \inmodule libXpertMassCore
51 \ingroup XpertMassCoreUtilities
52 \inheaderfile MonomerDictionary.hpp
53
54 \brief The MonomerDictionary class provides a Monomer code dictionary allowing
55 the user to automatically translate Monomer codes from x-letter codes to
56 y-letter codes. For example, a monomer dictionary file can define how to
57 translate 3-letter monomer codes to 1-letter codes. This is typically useful
58 when working on Protein Data Bank (PDB) files.
59
60 The format of the dictionary file is the following:
61
62 \code
63 # Converts from code on the left of '>' to code on the right.
64 # Number of letters allowed in each code is
65 # described with syntax 3>1 and that line should be the first
66 # non-comment line in this file.
67 3>1
68 ALA>A
69 CYS>C
70 ASP>D
71 \endcode
72
73 There might be more than one \e{section} in the file, with, for example, 3>1
74 translations and then 1>3 translations.
75 */
76
77 /*!
78 \variable MsXpS::libXpertMassCore::MonomerDictionary::m_filePath
79
80 \brief Path to the file documenting the translations.
81 */
82
83 /*!
84 \variable MsXpS::libXpertMassCore::MonomerDictionary::m_dictionaryHash
85
86 \brief The hash that documents the translations.
87 */
88
89 /*!
90 \variable MsXpS::libXpertMassCore::MonomerDictionary::m_dictionaryLoaded
91
92 \brief Indicates if the dictionary file has been loaded already.
93 */
94
95 /*!
96 \variable MsXpS::libXpertMassCore::MonomerDictionary::m_inputChainStringList
97
98 \brief The list of sequences to be converted.
99 */
100
101 /*!
102 \variable MsXpS::libXpertMassCore::MonomerDictionary::m_inputCodeLength
103
104 \brief The count of letters in the input Monomer code.
105
106 In a dictionary file that has a section
107
108 \code
109 3>1
110 \endcode
111
112 this value would be \e 3.
113 */
114
115 /*!
116 \variable MsXpS::libXpertMassCore::MonomerDictionary::m_outputCodeLength
117
118 \brief The count of letters in the output Monomer code.
119
120 In a dictionary file that has a section
121
122 \code
123 3>1
124 \endcode
125
126 this value would be \e 1.
127 */
128
129 /*!
130 \brief Constructs a MonomerDictionary instance.
131
132 \list
133 \li \a file_path: the path to the file containing the Monomer dictionary.
134 \li \a input_chain_string_list: the list of sequences to be converted.
135 \li \a input_code_length: the count of letters in the Monomer codes in the input string.
136 \li \a output_code_length: the count of letters in the Monomer codes in the output string.
137 \endlist
138 */
139 MonomerDictionary::MonomerDictionary(QString file_path,
140 const QStringList &input_chain_string_list,
141 int input_code_length,
142 int output_code_length)
143 : m_filePath(file_path),
144 m_inputChainStringList(input_chain_string_list),
145 m_inputCodeLength(input_code_length),
146 m_outputCodeLength(output_code_length)
147 {
148 }
149
150 /*!
151 \brief Destructs this MonomerDictionary instance.
152 */
153 MonomerDictionary::~MonomerDictionary()
154 {
155 }
156
157 /*!
158 \brief Sets the \a file_path to the Monomer dictionary file.
159 */
160 void
161 MonomerDictionary::setFilePath(QString &file_path)
162 {
163 m_filePath = file_path;
164 }
165
166 /*!
167 \brief Sets the list of input sequences to \a input_chain_string_list.
168 */
169 void
170 MonomerDictionary::setInputChainStringList(
171 const QStringList &input_chain_string_list)
172 {
173 m_inputChainStringList = input_chain_string_list;
174 }
175
176 /*!
177 \brief Set the count of letters in the input Monomer codes to \a code_length.
178 */
179 void
180 MonomerDictionary::setInputCodeLength(int code_length)
181 {
182 m_inputCodeLength = code_length;
183 }
184
185 /*!
186 \brief Set the count of letters in the output Monomer codes to \a code_length.
187 */
188 void
189 MonomerDictionary::setOutputCodeLength(int code_length)
190 {
191 m_outputCodeLength = code_length;
192 }
193
194 /*!
195 \brief Return true if the \a line parsed is in the form X>Y, that is, that it specifies the kind of Monomer code translation.
196 */
197 bool
198 MonomerDictionary::isLineProperSectionDivider(const QString &line)
199 {
200 // Section dividers in the monomer dictionary file format are
201 // lines containing the following syntax: X>Y, that is for example
202 // 3>1. This means that the following translation rules (like
203 // ILE>I) should convert 3-letter codes into 1-letter codes.
204
205 // However, this line should only be considered proper if X is
206 // actually the value of m_inputCodeLength and Y the value of
207 // m_outputCodeLength.
208
209 // qDebug() << __FILE__ << __LINE__
210 // << "Checking if line is proper section divider :" << line;
211
212 if(line.contains(QRegularExpression("[0-9]+>[0-9]+")))
213 {
214 // We are opening a new section, get the input/output code
215 // lengths and if they math what we expect, then set the
216 // current stream position and call the section parser.
217
218 int greaterThanIndex = line.indexOf('>');
219
220 QString codeLengthString = line.left(greaterThanIndex);
221
222 // qDebug() << __FILE__ << __LINE__
223 // << "Left codeLengthString:" << codeLengthString
224 // << "m_inputCodeLength:" << m_inputCodeLength;
225
226 bool ok = false;
227 int codeLength = codeLengthString.toInt(&ok, 10);
228
229 if(!codeLength && !ok)
230 {
231 qDebug() << __FILE__ << __LINE__ << "Monomer dictionary"
232 << "Failed to parse file " << m_filePath << "at line "
233 << line;
234
235 return false;
236 }
237
238 if(codeLength != m_inputCodeLength)
239 {
240 return false;
241 }
242
243 codeLengthString = line.mid(greaterThanIndex + 1, -1);
244
245 // qDebug() << __FILE__ << __LINE__
246 // << "Right codeLengthString:" << codeLengthString
247 // << "m_outputCodeLength:" << m_outputCodeLength;
248
249 ok = false;
250 codeLength = codeLengthString.toInt(&ok, 10);
251
252 if(!codeLength && !ok)
253 {
254 qDebug() << __FILE__ << __LINE__ << "Monomer dictionary"
255 << "Failed to parse file " << m_filePath << "at line "
256 << line;
257
258 return false;
259 }
260
261 if(codeLength != m_outputCodeLength)
262 {
263 return false;
264 }
265
266 // At this point, it seems we are in the proper
267 // section.
268
269 return true;
270 }
271
272 // If we are here, that means that the section is not for us.
273
274 // qDebug() << __FILE__ << __LINE__
275 // << "Line is no proper section divider.";
276
277 return false;
278 }
279
280 void
281 MonomerDictionary::skipSection(QTextStream *stream)
282 {
283 // We have entered a section, all we have to do is go through it
284 // and return when we have found either the end of the stream or
285 // the {END} marker.
286
287 qint64 lineLength = 1024;
288 QString line;
289
290 while(!stream->atEnd())
291 {
292 line = stream->readLine(lineLength);
293
294 if(!line.contains("{END}"))
295 {
296 line = stream->readLine(lineLength);
297 }
298 else
299 return;
300 }
301 }
302
303 /*!
304 \brief Parses the Monomer dictionary file section in \a stream and fills in the \l m_dictionaryHash with the translation pair.
305 */
306 int
307 MonomerDictionary::parseSection(QTextStream *stream)
308 {
309 Q_ASSERT(stream);
310
311 qint64 lineLength = 1024;
312 QString line;
313
314 // Iterate in the file using the stream and for each line create
315 // an item to insert into the dictionary hash.
316
317 while(!stream->atEnd())
318 {
319 line = stream->readLine(lineLength);
320
321 // We might encounter the end of the section, that is a line
322 // having {END} as its sole content.
323
324 if(line.contains("{END}"))
325 break;
326
327 QStringList stringList = line.split('>');
328
329 QString inputCode = stringList.first();
330 QString outputCode = stringList.last();
331
332 // Check that the monomer codes have the proper length.
333
334 if(inputCode.length() != m_inputCodeLength ||
335 outputCode.length() != m_outputCodeLength)
336 {
337 qDebug() << __FILE__ << __LINE__ << QObject::tr("Monomer dictionary:")
338 << QObject::tr("Failed to load dictionary.")
339 << QObject::tr("Monomer code lengths do not match:")
340 << QObject::tr("inputCode:") << inputCode
341 << QObject::tr("outputCode:") << outputCode;
342
343
344 // We have to empty the hash
345 m_dictionaryHash.clear();
346
347 break;
348 }
349
350 m_dictionaryHash.insert(inputCode, outputCode);
351
352 // qDebug() << __FILE__ << __LINE__
353 // << stringList.first () << stringList.last ();
354 }
355
356 // At this point the parsing is finished, either because we
357 // encountered the {END} section-ending delimiter, or because we
358 // reached the en of file.
359
360 int hashSize = m_dictionaryHash.size();
361
362 if(hashSize)
363 m_dictionaryLoaded = true;
364 else
365 {
366 qDebug() << __FILE__ << __LINE__ << QObject::tr("Monomer dictionary:")
367 << QObject::tr("Failed to load dictionary.");
368
369 m_dictionaryLoaded = false;
370 }
371
372 return hashSize;
373 }
374
375 /*!
376 \brief Returns true if the Monomer dictionary file could be loaded successfully, false otherwise.
377 */
378 bool
379 MonomerDictionary::loadDictionary()
380 {
381 // Load the file and for each line deconstruct the item into two
382 // QString objects that are used to make a QHash entry in
383 // QHash<QString, QString> m_dictionaryHash.
384 bool success = true;
385 qint64 lineLength = 1024;
386 QString line;
387
388 QFile file(m_filePath);
389
390 if(!file.open(QIODevice::ReadOnly))
391 {
392
393 m_dictionaryLoaded = false;
394
395 qDebug() << __FILE__ << __LINE__ << "Monomer dictionary:"
396 << "Failed to open file" << m_filePath << "for writing.";
397
398 return false;
399 }
400
401 if(m_inputCodeLength < 1 || m_outputCodeLength < 1)
402 {
403 qDebug() << __FILE__ << __LINE__ << "Monomer dictionary:"
404 << "Failed to parse file " << m_filePath
405 << "Please, set the m_inputCodeLength and "
406 "m_ouputCodeLength variables first.";
407
408 return false;
409 }
410
411 QTextStream *stream = new QTextStream(&file);
412 stream->setEncoding(QStringConverter::Utf8);
413
414 while(!stream->atEnd())
415 {
416 line = stream->readLine(lineLength);
417
418 // qDebug() << __FILE__ << __LINE__
419 // << "line: " << line;
420
421 // Remove spaces from start and end of line.
422 line = line.simplified();
423
424 if(line.startsWith('#') || line.isEmpty())
425 {
426 line = stream->readLine(lineLength);
427 continue;
428 }
429
430 // There might be any number of sections in the file, all
431 // delimited with a X>Y directive, indicating how many
432 // characters are allowed for the input code and for the
433 // output code.
434
435 if(!isLineProperSectionDivider(line))
436 {
437 // qDebug() << __FILE__ << __LINE__
438 // << "skipping line:" << line;
439
440 line = stream->readLine(lineLength);
441 continue;
442 }
443 else
444 {
445 // qDebug() << __FILE__ << __LINE__
446 // << "parsing section: " << line;
447
448 if(parseSection(stream) < 1)
449 {
450 qDebug() << __FILE__ << __LINE__ << "Monomer dictionary:"
451 << "Failed to parse file " << m_filePath;
452
453 success = false;
454 break;
455 }
456 else
457 {
458 // We successfully parsed the section. Our work is done.
459
460 success = true;
461 break;
462 }
463 }
464 }
465
466 delete stream;
467
468 return success;
469 }
470
471 /*!
472 \brief Perform the actual translation from the input Monomer code form to the output Monomer code form on all the strings contained in \a chain_string_list.
473 */
474 QStringList *
475 MonomerDictionary::translate(const QStringList &chain_string_list)
476 {
477 // The string in sequence is a space-separated list of monomer
478 // codes in the original monomer code format. We have to translate
479 // that to the proper monomer code format using the hash in this
480 // dictionary.
481
482 QStringList *outputChainStringList = new QStringList();
483
484 if(!chain_string_list.isEmpty())
485 m_inputChainStringList = chain_string_list;
486
487 // If there is nothing to do return an empty string list so that
488 // caller knows nothing is actually wrong, only there is no
489 // sequence to translate.
490 if(m_inputChainStringList.isEmpty())
491 return outputChainStringList;
492
493 // Iterate in each chain string of the list and perform the
494 // translation.
495
496 for(int iter = 0; iter < m_inputChainStringList.size(); ++iter)
497 {
498 QString iterString = chain_string_list.at(iter);
499
500 // qDebug() << __FILE__ << __LINE__
501 // << "translating sequence:" << iterString;
502
503 QStringList codeList =
504 iterString.split(QRegularExpression("\\s+"), Qt::SkipEmptyParts);
505
506 // qDebug() << __FILE__ << __LINE__
507 // << "codeList:" << codeList;
508
509 // qDebug() << __FILE__ << __LINE__
510 // << "hash:"
511 // << m_dictionaryHash;
512
513 for(int jter = 0; jter < codeList.size(); ++jter)
514 {
515 QString code = codeList.at(jter);
516
517 QHash<QString, QString>::const_iterator hashIter =
518 m_dictionaryHash.find(code);
519
520 if(hashIter != m_dictionaryHash.end())
521 codeList.replace(jter, hashIter.value());
522 else
523 {
524 // Delete the string list, set the pointer to 0 and
525 // return that pointer so that caller knows something
526 // has gone wrong.
527
528 qDebug() << __FILE__ << __LINE__ << "Monomer dictionary:"
529 << "Failed to convert monomer code " << code;
530
531 outputChainStringList->clear();
532
533 delete outputChainStringList;
534 outputChainStringList = nullptr;
535
536 return outputChainStringList;
537 }
538 }
539
540 // At this point the sequence codes have been translated. Join all
541 // the item of the codeList into one single string.
542
543 outputChainStringList->append(codeList.join(QString("")));
544 }
545
546 // End of
547 // for (int iter = 0; iter < chainStringList.size(); ++iter)
548
549 // If no translation could be performed, return a n
550
551 if(!outputChainStringList->size())
552 {
553 outputChainStringList->clear();
554
555 delete outputChainStringList;
556 outputChainStringList = 0;
557 }
558
559 return outputChainStringList;
560 }
561
562
563 } // namespace libXpertMassCore
564 } // namespace MsXpS
565