Skip to content

Commit 0d20535

Browse files
Sebastian NiehusSebastian Niehus
authored andcommitted
Various fixes and improvements.
Improve memory management for many samples in PopDel call. Add option '-e'/'--per-sample-rgid' to resolve conflicting read group names in PopDel profile. Fix "too big offset in window"-crash in PopDel profile. Fix bug in automatic determination of output profile path in PopDel profile.
1 parent 2dd626d commit 0d20535

10 files changed

Lines changed: 220 additions & 61 deletions

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ SEQAN_LIB=.
1313
CXXFLAGS+=-I$(SEQAN_LIB) -DSEQAN_HAS_ZLIB=1 -DSEQAN_USE_HTSLIB=1 -std=c++14 -DSEQAN_DISABLE_VERSION_CHECK
1414
LDLIBS=-lz -lpthread -lhts
1515

16-
DATE=on 2020-11-12
17-
VERSION=1.3.0
16+
DATE=on 2021-03-03
17+
VERSION=1.4.0
1818
CXXFLAGS+=-DDATE=\""$(DATE)"\" -DVERSION=\""$(VERSION)"\"
1919

2020
# Enable warnings

README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,10 @@ See wiki for more information on how to view the profile with [PopDel View](http
5151
## Citation
5252
Sebastian Niehus, Hákon Jónsson, Janina Schönberger, Eythór Björnsson, Doruk Beyter, Hannes P. Eggertsson, Patrick Sulem, Kári Stefánsson, Bjarni V. Halldórsson, Birte Kehr. _PopDel identifies medium-size deletions simultaneously in tens of thousands of genomes_. Nat Commun 12, 730 (2021). https://doi.org/10.1038/s41467-020-20850-5
5353

54-
5554
## Version and License
5655
```
57-
Last update: 2020-11-12
58-
PopDel version: 1.3.0
56+
Last update: 2021-03-03
57+
PopDel version: 1.4.0
5958
SeqAn version: 2.1 (with HTSlib support added by Hannes P.Eggertsson)
6059
Author: Sebastian Niehus (Sebastian.Niehus[at]ukr.de)
6160
```

insert_histogram_popdel.h

Lines changed: 172 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ struct Histogram
3030
unsigned upperQuantileDist; // Distance from median to 95% quantile.
3131
unsigned windowSize; // The size of one window in bp's.
3232

33-
Histogram() :
33+
Histogram():
3434
values(""),
3535
min_prob(0.0),
3636
offset(0),
@@ -44,10 +44,37 @@ struct Histogram
4444
// ---------------------------------------------------------------------------------------
4545
// Function clearStrings()
4646
// ---------------------------------------------------------------------------------------
47-
// Clears all member strings.
48-
inline void clearStrings()
47+
// Clears all member strings. If shrink is true, the object will be shrunk to capacity 0.
48+
inline void clearStrings(bool shrink=false)
4949
{
5050
clear(values);
51+
if (shrink)
52+
{
53+
resize(values, 0);
54+
shrinkToFit(values);
55+
}
56+
else
57+
{
58+
clear(values);
59+
}
60+
}
61+
// Return the size of the histogram in byte
62+
inline unsigned getSize() const
63+
{
64+
return (sizeof(values) +
65+
length(values) * sizeof(double) +
66+
4 * sizeof(double) +
67+
1 * sizeof(int) +
68+
5 * sizeof(unsigned));
69+
}
70+
// Return the capcity of the histogram in byte
71+
inline unsigned getCapactiy() const
72+
{
73+
return (sizeof(values) +
74+
capacity(values) * sizeof(double) +
75+
4 * sizeof(double) +
76+
1 * sizeof(int) +
77+
5 * sizeof(unsigned));
5178
}
5279
};
5380
inline unsigned getHistLeftBorder(const Histogram & hist)
@@ -227,7 +254,7 @@ inline void writeIndexIntoHeader(TStream & stream, String<String<uint64_t> > pro
227254
}
228255

229256
// Move current position of stream back to ending.
230-
stream.seekp(last);
257+
stream.seekp(last);
231258
}
232259

233260
// =======================================================================================
@@ -240,7 +267,8 @@ inline void readProfileHeader(TStream & stream,
240267
String<Histogram> & histograms,
241268
String<CharString> & contigNames,
242269
String<int32_t> & contigLengths,
243-
unsigned & indexRegionSize)
270+
unsigned & indexRegionSize,
271+
bool dropContigs = false)
244272
{
245273
// Read the magic string.
246274
CharString buffer;
@@ -291,8 +319,8 @@ inline void readProfileHeader(TStream & stream,
291319
msg << "Profile seems to have 0 read groups.";
292320
SEQAN_THROW(ParseError(toCString(msg.str())));
293321
}
294-
resize(readGroups, numReadGroups);
295-
resize(histograms, numReadGroups);
322+
resize(readGroups, numReadGroups, Exact());
323+
resize(histograms, numReadGroups, Exact());
296324

297325
// Read the read group names and histograms.
298326
for (unsigned i = 0; i < numReadGroups; ++i)
@@ -307,7 +335,7 @@ inline void readProfileHeader(TStream & stream,
307335
}
308336

309337
// Read the read group name.
310-
resize(readGroups[i], nameLen-1);
338+
resize(readGroups[i], nameLen-1, Exact());
311339
stream.read(reinterpret_cast<char *>(&readGroups[i][0]), nameLen-1);
312340
stream.read(&buffer[0], 1);
313341
if (!stream.good())
@@ -352,7 +380,7 @@ inline void readProfileHeader(TStream & stream,
352380
msg << "Unable to read insert size histogram size.";
353381
SEQAN_THROW(ParseError(toCString(msg.str())));
354382
}
355-
resize(histograms[i].values, histEnd - histograms[i].offset);
383+
resize(histograms[i].values, histEnd - histograms[i].offset, Exact());
356384
for (unsigned h = 0; h < histEnd - histograms[i].offset; ++h)
357385
{
358386
stream.read(reinterpret_cast<char *>(&histograms[i].values[h]), sizeof(double));
@@ -372,9 +400,11 @@ inline void readProfileHeader(TStream & stream,
372400
msg << "Unable to read number of contigs.";
373401
SEQAN_THROW(ParseError(toCString(msg.str())));
374402
}
375-
resize(contigNames, numContigs);
376-
resize(contigLengths, numContigs);
377-
403+
if (!dropContigs)
404+
{
405+
resize(contigNames, numContigs, Exact());
406+
resize(contigLengths, numContigs, Exact());
407+
}
378408
// Read the chromosome names and lengths.
379409
for (unsigned i = 0; i < numContigs; ++i)
380410
{
@@ -388,26 +418,38 @@ inline void readProfileHeader(TStream & stream,
388418
}
389419

390420
// Read the chromosome name.
391-
resize(contigNames[i], nameLen-1);
392-
stream.read(reinterpret_cast<char *>(&contigNames[i][0]), nameLen-1);
393-
stream.read(&buffer[0], 1);
394-
if (!stream.good())
395-
{
396-
msg << "Unable to read contig name.";
397-
SEQAN_THROW(ParseError(toCString(msg.str())));
398-
}
399-
if (buffer[0] != '\0') // Expect chromosome names to be null-terminated.
421+
if (!dropContigs)
400422
{
401-
msg << "Expecting contig name to be null-terminated.";
402-
SEQAN_THROW(ParseError(toCString(msg.str())));
403-
}
423+
resize(contigNames[i], nameLen-1, Exact());
424+
stream.read(reinterpret_cast<char *>(&contigNames[i][0]), nameLen-1);
425+
stream.read(&buffer[0], 1);
426+
if (!stream.good())
427+
{
428+
msg << "Unable to read contig name.";
429+
SEQAN_THROW(ParseError(toCString(msg.str())));
430+
}
431+
if (buffer[0] != '\0') // Expect chromosome names to be null-terminated.
432+
{
433+
msg << "Expecting contig name to be null-terminated.";
434+
SEQAN_THROW(ParseError(toCString(msg.str())));
435+
}
404436

405-
// Read the chromosome length.
406-
stream.read(reinterpret_cast<char *>(&contigLengths[i]), sizeof(int32_t));
407-
if (!stream.good())
437+
// Read the chromosome length.
438+
stream.read(reinterpret_cast<char *>(&contigLengths[i]), sizeof(int32_t));
439+
if (!stream.good())
440+
{
441+
msg << "Unable to read contig length.";
442+
SEQAN_THROW(ParseError(toCString(msg.str())));
443+
}
444+
}
445+
else
408446
{
409-
msg << "Unable to read contig length.";
410-
SEQAN_THROW(ParseError(toCString(msg.str())));
447+
stream.ignore(nameLen + sizeof(int32_t));
448+
if (!stream.good())
449+
{
450+
msg << "Unable to skip contig name and length.";
451+
SEQAN_THROW(ParseError(toCString(msg.str())));
452+
}
411453
}
412454
}
413455
}
@@ -790,7 +832,7 @@ inline bool checkUniqueRG(std::map<CharString, unsigned> & readGroups,
790832
if (readGroups.count(readGroup) != 0)
791833
{
792834
std::ostringstream msg;
793-
msg << "WARNING: Duplicate insert size histogram of read group \'"
835+
msg << "WARNING: Duplicate insert size histogram of read group \'"
794836
<< readGroup << "\'. Skipping it in file \'" << filename << "\'.";
795837
printStatus(msg);
796838
return false;
@@ -884,7 +926,9 @@ inline void loadInsertSizeHistograms(String<Histogram> & histograms, /
884926
String<String<CharString> > & contigNames,
885927
String<String<int32_t> > & contigLengths,
886928
String<unsigned> & indexRegionSizes,
887-
bool smoothing,
929+
const bool & smoothing,
930+
const bool & modRgByFileName,
931+
const bool & representativeContigs,
888932
const unsigned & pseudoCountFraction)
889933
{
890934
std::ifstream infile = tryOpenHistogram(filename);
@@ -893,27 +937,91 @@ inline void loadInsertSizeHistograms(String<Histogram> & histograms, /
893937
String<CharString> sampleContigNames;
894938
String<int32_t> sampleContigLengths; // TODO: Make this more efficient by directly writing to the final sets.
895939
unsigned indexRegionSize = 0;
940+
bool dropContigs = !empty(contigNames) && representativeContigs;
896941
readProfileHeader(infile,
897942
filename,
898943
sampleReadGroups,
899944
sampleHistograms,
900945
sampleContigNames,
901946
sampleContigLengths,
902-
indexRegionSize);
947+
indexRegionSize,
948+
dropContigs);
949+
if (representativeContigs && !empty(indexRegionSizes))
950+
{
951+
if (indexRegionSize != indexRegionSizes[0])
952+
{
953+
std::ostringstream msg;
954+
msg << "Index region sizes are not equal! Terminating.";
955+
SEQAN_THROW(IOError(toCString(msg.str())));
956+
}
957+
}
903958
// Process all histograms of the sample and add them.
904959
unsigned rg = length(readGroups);
960+
reserve(rgs, length(sampleReadGroups), Exact());
961+
bool rgAdded = false;
905962
for (unsigned i = 0; i < length(sampleReadGroups); ++i)
906963
{
907964
if (checkUniqueRG(readGroups, rg, sampleReadGroups[i], filename))
908965
{
909966
appendValue(rgs, rg - 1);
967+
rgAdded = true;
910968
processHistogram(sampleHistograms[i], 256, smoothing, pseudoCountFraction);
911969
appendValue(histograms, sampleHistograms[i]);
970+
sampleHistograms[i].clearStrings(true);
971+
}
972+
// Hacky part for enhancing read group IDs with the file name. Use on your own risk.
973+
else if (modRgByFileName)
974+
{
975+
for(unsigned i = 0; i < length(sampleReadGroups); ++i)
976+
{
977+
CharString newRGName = getSampleName(filename);
978+
append(newRGName, ":");
979+
append(newRGName, sampleReadGroups[i]);
980+
std::cout << "WARNING: Internally replacing read group ID \'" << sampleReadGroups[i]
981+
<< "\' of file \'" << filename << "\' with \'"<< newRGName <<"\'" << std::endl;
982+
sampleReadGroups[i] = newRGName;
983+
}
984+
if (checkUniqueRG(readGroups, rg, sampleReadGroups[i], filename))
985+
{
986+
appendValue(rgs, rg - 1);
987+
rgAdded = true;
988+
processHistogram(sampleHistograms[i], 256, smoothing, pseudoCountFraction);
989+
appendValue(histograms, sampleHistograms[i]);
990+
}
991+
else
992+
{
993+
std::ostringstream msg;
994+
msg << "This should not have happened... Tried to de-duplicate read groups of \'" << filename
995+
<< "\' << by adding the file name to the IDs, but something went wrong. Terminating.";
996+
SEQAN_THROW(IOError(toCString(msg.str())));
997+
}
912998
}
999+
else
1000+
{
1001+
std::ostringstream msg;
1002+
msg << "Previously observed read groups detected in file \'" << filename << "\'. "
1003+
<< "Please make sure that all read group IDs of your cohort are unique or "
1004+
<< "remove the sample(s) in question. If you are sure that the duplicate read group names are OK "
1005+
<< "please use the option \'-e\'/'--per-sample-rgid\' to resolve the conflict. Terminating.";
1006+
SEQAN_THROW(IOError(toCString(msg.str())));
1007+
}
1008+
}
1009+
if (!rgAdded)
1010+
{
1011+
std::ostringstream msg;
1012+
msg << "All read group IDs of sample \'" << filename
1013+
<< "\' are duplicates of previously observed read group IDs."
1014+
<< "Please make sure that all read group IDs of your cohort are unique "
1015+
<< "or remove the sample(s) in question. If you are sure that the duplicate read group names are OK "
1016+
<< "please use the option \'-e\'/'--per-sample-rgid\' to resolve the conflict. Terminating.";
1017+
SEQAN_THROW(IOError(toCString(msg.str())));
1018+
}
1019+
if (!dropContigs)
1020+
{
1021+
appendValue(contigNames, sampleContigNames);
1022+
appendValue(contigLengths, sampleContigLengths);
1023+
appendValue(indexRegionSizes, indexRegionSize);
9131024
}
914-
appendValue(indexRegionSizes, indexRegionSize);
915-
appendValue(contigNames, sampleContigNames);
916-
appendValue(contigLengths, sampleContigLengths);
9171025
}
9181026
//Overload for applying loadInsertSizeHistograms on multiple files.
9191027
inline void loadInsertSizeHistograms(String<Histogram> & histograms,
@@ -923,12 +1031,30 @@ inline void loadInsertSizeHistograms(String<Histogram> & histograms,
9231031
String<String<CharString> > & contigNames,
9241032
String<String<int32_t> > & contigLengths,
9251033
String<unsigned> & indexRegionSizes,
926-
bool smoothing,
1034+
const bool & smoothing,
1035+
const bool & modRgByFileName,
1036+
const bool & representativeContigs,
9271037
const unsigned & pseudoCountFraction)
9281038
{
929-
resize(rgs, length(filenames), Exact());
1039+
unsigned n = length(filenames);
1040+
resize(rgs, n, Exact());
1041+
reserve(histograms, n, Exact());
1042+
if (representativeContigs)
1043+
{
1044+
reserve(contigNames, 1, Exact());
1045+
reserve(contigLengths, 1, Exact());
1046+
reserve(indexRegionSizes, 1, Exact());
1047+
}
1048+
else
1049+
{
1050+
reserve(contigNames, n, Exact());
1051+
reserve(contigLengths, n, Exact());
1052+
reserve(indexRegionSizes, n, Exact());
1053+
}
9301054
unsigned sampleNum = 0;
931-
for (unsigned i = 0; i < length(filenames); ++i)
1055+
unsigned long long totalSize = 0;
1056+
1057+
for (unsigned i = 0; i < n; ++i)
9321058
{
9331059
loadInsertSizeHistograms(histograms,
9341060
readGroups,
@@ -938,12 +1064,19 @@ inline void loadInsertSizeHistograms(String<Histogram> & histograms,
9381064
contigLengths,
9391065
indexRegionSizes,
9401066
smoothing,
1067+
modRgByFileName,
1068+
representativeContigs,
9411069
pseudoCountFraction);
1070+
unsigned histSize = histograms[i].getSize();
1071+
totalSize += histSize;
9421072
std::ostringstream msg;
943-
msg << "Loaded histogram from file \'" << filenames[i] << "\'.";
1073+
msg << "Loaded histogram from file \'" << filenames[i] << "\' in " << histSize << " byte.";
9441074
printStatus(msg);
9451075
++sampleNum;
9461076
}
1077+
std::ostringstream msg;
1078+
msg << "Total memory reserved for histograms amounts to " << totalSize << " byte.";
1079+
printStatus(msg);
9471080
}
9481081
// -----------------------------------------------------------------------------
9491082
// Function I()

parse_popdel.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ template<typename TParams>
4444
ArgumentParser::ParseResult parseCommandLine(TParams & params, int argc, char const ** argv)
4545
{
4646
// Setup the parser.
47-
ArgumentParser parser(argv[0]);
47+
ArgumentParser parser("PopDel");
4848
setupParser(parser, params);
4949
// Parse the command line and write error messages to error stream.
5050
std::ostringstream errorStream;
@@ -72,7 +72,7 @@ ArgumentParser::ParseResult parseCommandLine(TParams & params, int argc, char co
7272
ArgumentParser::ParseResult parseCommandLine(PopDelViewParameters & params, int argc, char const ** argv)
7373
{
7474
// Setup the parser.
75-
ArgumentParser parser(argv[0]);
75+
ArgumentParser parser("PopDel");
7676
setupParser(parser, params);
7777
// Parse the command line and write error messages to error stream.
7878
std::ostringstream errorStream;

0 commit comments

Comments
 (0)