@@ -30,7 +30,7 @@ struct Histogram
3030 unsigned upperQuantileDist; // Distance from median to 95% quantile.
3131 unsigned windowSize; // The size of one window in bp's.
3232
33- Histogram () :
33+ Histogram ():
3434 values (" " ),
3535 min_prob (0.0 ),
3636 offset (0 ),
@@ -44,10 +44,37 @@ struct Histogram
4444 // ---------------------------------------------------------------------------------------
4545 // Function clearStrings()
4646 // ---------------------------------------------------------------------------------------
47- // Clears all member strings.
48- inline void clearStrings ()
47+ // Clears all member strings. If shrink is true, the object will be shrunk to capacity 0.
48+ inline void clearStrings (bool shrink= false )
4949 {
5050 clear (values);
51+ if (shrink)
52+ {
53+ resize (values, 0 );
54+ shrinkToFit (values);
55+ }
56+ else
57+ {
58+ clear (values);
59+ }
60+ }
61+ // Return the size of the histogram in byte
62+ inline unsigned getSize () const
63+ {
64+ return (sizeof (values) +
65+ length (values) * sizeof (double ) +
66+ 4 * sizeof (double ) +
67+ 1 * sizeof (int ) +
68+ 5 * sizeof (unsigned ));
69+ }
70+ // Return the capcity of the histogram in byte
71+ inline unsigned getCapactiy () const
72+ {
73+ return (sizeof (values) +
74+ capacity (values) * sizeof (double ) +
75+ 4 * sizeof (double ) +
76+ 1 * sizeof (int ) +
77+ 5 * sizeof (unsigned ));
5178 }
5279};
5380inline unsigned getHistLeftBorder (const Histogram & hist)
@@ -227,7 +254,7 @@ inline void writeIndexIntoHeader(TStream & stream, String<String<uint64_t> > pro
227254 }
228255
229256 // Move current position of stream back to ending.
230- stream.seekp (last);
257+ stream.seekp (last);
231258}
232259
233260// =======================================================================================
@@ -240,7 +267,8 @@ inline void readProfileHeader(TStream & stream,
240267 String<Histogram> & histograms,
241268 String<CharString> & contigNames,
242269 String<int32_t > & contigLengths,
243- unsigned & indexRegionSize)
270+ unsigned & indexRegionSize,
271+ bool dropContigs = false )
244272{
245273 // Read the magic string.
246274 CharString buffer;
@@ -291,8 +319,8 @@ inline void readProfileHeader(TStream & stream,
291319 msg << " Profile seems to have 0 read groups." ;
292320 SEQAN_THROW (ParseError (toCString (msg.str ())));
293321 }
294- resize (readGroups, numReadGroups);
295- resize (histograms, numReadGroups);
322+ resize (readGroups, numReadGroups, Exact () );
323+ resize (histograms, numReadGroups, Exact () );
296324
297325 // Read the read group names and histograms.
298326 for (unsigned i = 0 ; i < numReadGroups; ++i)
@@ -307,7 +335,7 @@ inline void readProfileHeader(TStream & stream,
307335 }
308336
309337 // Read the read group name.
310- resize (readGroups[i], nameLen-1 );
338+ resize (readGroups[i], nameLen-1 , Exact () );
311339 stream.read (reinterpret_cast <char *>(&readGroups[i][0 ]), nameLen-1 );
312340 stream.read (&buffer[0 ], 1 );
313341 if (!stream.good ())
@@ -352,7 +380,7 @@ inline void readProfileHeader(TStream & stream,
352380 msg << " Unable to read insert size histogram size." ;
353381 SEQAN_THROW (ParseError (toCString (msg.str ())));
354382 }
355- resize (histograms[i].values , histEnd - histograms[i].offset );
383+ resize (histograms[i].values , histEnd - histograms[i].offset , Exact () );
356384 for (unsigned h = 0 ; h < histEnd - histograms[i].offset ; ++h)
357385 {
358386 stream.read (reinterpret_cast <char *>(&histograms[i].values [h]), sizeof (double ));
@@ -372,9 +400,11 @@ inline void readProfileHeader(TStream & stream,
372400 msg << " Unable to read number of contigs." ;
373401 SEQAN_THROW (ParseError (toCString (msg.str ())));
374402 }
375- resize (contigNames, numContigs);
376- resize (contigLengths, numContigs);
377-
403+ if (!dropContigs)
404+ {
405+ resize (contigNames, numContigs, Exact ());
406+ resize (contigLengths, numContigs, Exact ());
407+ }
378408 // Read the chromosome names and lengths.
379409 for (unsigned i = 0 ; i < numContigs; ++i)
380410 {
@@ -388,26 +418,38 @@ inline void readProfileHeader(TStream & stream,
388418 }
389419
390420 // Read the chromosome name.
391- resize (contigNames[i], nameLen-1 );
392- stream.read (reinterpret_cast <char *>(&contigNames[i][0 ]), nameLen-1 );
393- stream.read (&buffer[0 ], 1 );
394- if (!stream.good ())
395- {
396- msg << " Unable to read contig name." ;
397- SEQAN_THROW (ParseError (toCString (msg.str ())));
398- }
399- if (buffer[0 ] != ' \0 ' ) // Expect chromosome names to be null-terminated.
421+ if (!dropContigs)
400422 {
401- msg << " Expecting contig name to be null-terminated." ;
402- SEQAN_THROW (ParseError (toCString (msg.str ())));
403- }
423+ resize (contigNames[i], nameLen-1 , Exact ());
424+ stream.read (reinterpret_cast <char *>(&contigNames[i][0 ]), nameLen-1 );
425+ stream.read (&buffer[0 ], 1 );
426+ if (!stream.good ())
427+ {
428+ msg << " Unable to read contig name." ;
429+ SEQAN_THROW (ParseError (toCString (msg.str ())));
430+ }
431+ if (buffer[0 ] != ' \0 ' ) // Expect chromosome names to be null-terminated.
432+ {
433+ msg << " Expecting contig name to be null-terminated." ;
434+ SEQAN_THROW (ParseError (toCString (msg.str ())));
435+ }
404436
405- // Read the chromosome length.
406- stream.read (reinterpret_cast <char *>(&contigLengths[i]), sizeof (int32_t ));
407- if (!stream.good ())
437+ // Read the chromosome length.
438+ stream.read (reinterpret_cast <char *>(&contigLengths[i]), sizeof (int32_t ));
439+ if (!stream.good ())
440+ {
441+ msg << " Unable to read contig length." ;
442+ SEQAN_THROW (ParseError (toCString (msg.str ())));
443+ }
444+ }
445+ else
408446 {
409- msg << " Unable to read contig length." ;
410- SEQAN_THROW (ParseError (toCString (msg.str ())));
447+ stream.ignore (nameLen + sizeof (int32_t ));
448+ if (!stream.good ())
449+ {
450+ msg << " Unable to skip contig name and length." ;
451+ SEQAN_THROW (ParseError (toCString (msg.str ())));
452+ }
411453 }
412454 }
413455}
@@ -790,7 +832,7 @@ inline bool checkUniqueRG(std::map<CharString, unsigned> & readGroups,
790832 if (readGroups.count (readGroup) != 0 )
791833 {
792834 std::ostringstream msg;
793- msg << " WARNING: Duplicate insert size histogram of read group \' "
835+ msg << " WARNING: Duplicate insert size histogram of read group \' "
794836 << readGroup << " \' . Skipping it in file \' " << filename << " \' ." ;
795837 printStatus (msg);
796838 return false ;
@@ -884,7 +926,9 @@ inline void loadInsertSizeHistograms(String<Histogram> & histograms, /
884926 String<String<CharString> > & contigNames,
885927 String<String<int32_t > > & contigLengths,
886928 String<unsigned > & indexRegionSizes,
887- bool smoothing,
929+ const bool & smoothing,
930+ const bool & modRgByFileName,
931+ const bool & representativeContigs,
888932 const unsigned & pseudoCountFraction)
889933{
890934 std::ifstream infile = tryOpenHistogram (filename);
@@ -893,27 +937,91 @@ inline void loadInsertSizeHistograms(String<Histogram> & histograms, /
893937 String<CharString> sampleContigNames;
894938 String<int32_t > sampleContigLengths; // TODO: Make this more efficient by directly writing to the final sets.
895939 unsigned indexRegionSize = 0 ;
940+ bool dropContigs = !empty (contigNames) && representativeContigs;
896941 readProfileHeader (infile,
897942 filename,
898943 sampleReadGroups,
899944 sampleHistograms,
900945 sampleContigNames,
901946 sampleContigLengths,
902- indexRegionSize);
947+ indexRegionSize,
948+ dropContigs);
949+ if (representativeContigs && !empty (indexRegionSizes))
950+ {
951+ if (indexRegionSize != indexRegionSizes[0 ])
952+ {
953+ std::ostringstream msg;
954+ msg << " Index region sizes are not equal! Terminating." ;
955+ SEQAN_THROW (IOError (toCString (msg.str ())));
956+ }
957+ }
903958 // Process all histograms of the sample and add them.
904959 unsigned rg = length (readGroups);
960+ reserve (rgs, length (sampleReadGroups), Exact ());
961+ bool rgAdded = false ;
905962 for (unsigned i = 0 ; i < length (sampleReadGroups); ++i)
906963 {
907964 if (checkUniqueRG (readGroups, rg, sampleReadGroups[i], filename))
908965 {
909966 appendValue (rgs, rg - 1 );
967+ rgAdded = true ;
910968 processHistogram (sampleHistograms[i], 256 , smoothing, pseudoCountFraction);
911969 appendValue (histograms, sampleHistograms[i]);
970+ sampleHistograms[i].clearStrings (true );
971+ }
972+ // Hacky part for enhancing read group IDs with the file name. Use on your own risk.
973+ else if (modRgByFileName)
974+ {
975+ for (unsigned i = 0 ; i < length (sampleReadGroups); ++i)
976+ {
977+ CharString newRGName = getSampleName (filename);
978+ append (newRGName, " :" );
979+ append (newRGName, sampleReadGroups[i]);
980+ std::cout << " WARNING: Internally replacing read group ID \' " << sampleReadGroups[i]
981+ << " \' of file \' " << filename << " \' with \' " << newRGName <<" \' " << std::endl;
982+ sampleReadGroups[i] = newRGName;
983+ }
984+ if (checkUniqueRG (readGroups, rg, sampleReadGroups[i], filename))
985+ {
986+ appendValue (rgs, rg - 1 );
987+ rgAdded = true ;
988+ processHistogram (sampleHistograms[i], 256 , smoothing, pseudoCountFraction);
989+ appendValue (histograms, sampleHistograms[i]);
990+ }
991+ else
992+ {
993+ std::ostringstream msg;
994+ msg << " This should not have happened... Tried to de-duplicate read groups of \' " << filename
995+ << " \' << by adding the file name to the IDs, but something went wrong. Terminating." ;
996+ SEQAN_THROW (IOError (toCString (msg.str ())));
997+ }
912998 }
999+ else
1000+ {
1001+ std::ostringstream msg;
1002+ msg << " Previously observed read groups detected in file \' " << filename << " \' . "
1003+ << " Please make sure that all read group IDs of your cohort are unique or "
1004+ << " remove the sample(s) in question. If you are sure that the duplicate read group names are OK "
1005+ << " please use the option \' -e\' /'--per-sample-rgid\' to resolve the conflict. Terminating." ;
1006+ SEQAN_THROW (IOError (toCString (msg.str ())));
1007+ }
1008+ }
1009+ if (!rgAdded)
1010+ {
1011+ std::ostringstream msg;
1012+ msg << " All read group IDs of sample \' " << filename
1013+ << " \' are duplicates of previously observed read group IDs."
1014+ << " Please make sure that all read group IDs of your cohort are unique "
1015+ << " or remove the sample(s) in question. If you are sure that the duplicate read group names are OK "
1016+ << " please use the option \' -e\' /'--per-sample-rgid\' to resolve the conflict. Terminating." ;
1017+ SEQAN_THROW (IOError (toCString (msg.str ())));
1018+ }
1019+ if (!dropContigs)
1020+ {
1021+ appendValue (contigNames, sampleContigNames);
1022+ appendValue (contigLengths, sampleContigLengths);
1023+ appendValue (indexRegionSizes, indexRegionSize);
9131024 }
914- appendValue (indexRegionSizes, indexRegionSize);
915- appendValue (contigNames, sampleContigNames);
916- appendValue (contigLengths, sampleContigLengths);
9171025}
9181026// Overload for applying loadInsertSizeHistograms on multiple files.
9191027inline void loadInsertSizeHistograms (String<Histogram> & histograms,
@@ -923,12 +1031,30 @@ inline void loadInsertSizeHistograms(String<Histogram> & histograms,
9231031 String<String<CharString> > & contigNames,
9241032 String<String<int32_t > > & contigLengths,
9251033 String<unsigned > & indexRegionSizes,
926- bool smoothing,
1034+ const bool & smoothing,
1035+ const bool & modRgByFileName,
1036+ const bool & representativeContigs,
9271037 const unsigned & pseudoCountFraction)
9281038{
929- resize (rgs, length (filenames), Exact ());
1039+ unsigned n = length (filenames);
1040+ resize (rgs, n, Exact ());
1041+ reserve (histograms, n, Exact ());
1042+ if (representativeContigs)
1043+ {
1044+ reserve (contigNames, 1 , Exact ());
1045+ reserve (contigLengths, 1 , Exact ());
1046+ reserve (indexRegionSizes, 1 , Exact ());
1047+ }
1048+ else
1049+ {
1050+ reserve (contigNames, n, Exact ());
1051+ reserve (contigLengths, n, Exact ());
1052+ reserve (indexRegionSizes, n, Exact ());
1053+ }
9301054 unsigned sampleNum = 0 ;
931- for (unsigned i = 0 ; i < length (filenames); ++i)
1055+ unsigned long long totalSize = 0 ;
1056+
1057+ for (unsigned i = 0 ; i < n; ++i)
9321058 {
9331059 loadInsertSizeHistograms (histograms,
9341060 readGroups,
@@ -938,12 +1064,19 @@ inline void loadInsertSizeHistograms(String<Histogram> & histograms,
9381064 contigLengths,
9391065 indexRegionSizes,
9401066 smoothing,
1067+ modRgByFileName,
1068+ representativeContigs,
9411069 pseudoCountFraction);
1070+ unsigned histSize = histograms[i].getSize ();
1071+ totalSize += histSize;
9421072 std::ostringstream msg;
943- msg << " Loaded histogram from file \' " << filenames[i] << " \' ." ;
1073+ msg << " Loaded histogram from file \' " << filenames[i] << " \' in " << histSize << " byte ." ;
9441074 printStatus (msg);
9451075 ++sampleNum;
9461076 }
1077+ std::ostringstream msg;
1078+ msg << " Total memory reserved for histograms amounts to " << totalSize << " byte." ;
1079+ printStatus (msg);
9471080}
9481081// -----------------------------------------------------------------------------
9491082// Function I()
0 commit comments