Skip to content

Commit 03aeb78

Browse files
author
Jonathan Bieler
committed
generator support, index support, update XAM, disable VCF
1 parent cfaa56e commit 03aeb78

27 files changed

+95
-63
lines changed

.github/workflows/CI.yml

100644100755
File mode changed.

.github/workflows/CompatHelper.yml

100644100755
File mode changed.

.github/workflows/TagBot.yml

100644100755
File mode changed.

.github/workflows/documentation.yml

100644100755
File mode changed.

.gitignore

100644100755
File mode changed.

LICENSE

100644100755
File mode changed.

Manifest.toml

100644100755
+25-37
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,22 @@
22

33
julia_version = "1.10.0"
44
manifest_format = "2.0"
5-
project_hash = "88cdf5bf224b9670f958866657ca51fb8330c936"
5+
project_hash = "08c55c789910edb857eb1c8aac4f01c644757cad"
66

77
[[deps.Artifacts]]
88
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
99

1010
[[deps.Automa]]
11-
deps = ["ScanByte", "TranscodingStreams"]
12-
git-tree-sha1 = "48e54446df62fdf9ef76959c32dc33f3cff659ee"
11+
deps = ["PrecompileTools", "TranscodingStreams"]
12+
git-tree-sha1 = "014bc22d6c400a7703c0f5dc1fdc302440cf88be"
1313
uuid = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
14-
version = "0.8.3"
14+
version = "1.0.4"
1515

1616
[[deps.BGZFStreams]]
1717
deps = ["CodecZlib"]
18-
git-tree-sha1 = "a9c80401403c068c02784cd53d417d3a82e2d2bd"
18+
git-tree-sha1 = "3aca54d25f8c30056577aa37ea68184da68df685"
1919
uuid = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6"
20-
version = "0.3.1"
20+
version = "0.3.2"
2121

2222
[[deps.Base64]]
2323
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
@@ -30,15 +30,15 @@ version = "3.1.0"
3030

3131
[[deps.BioGenerics]]
3232
deps = ["TranscodingStreams"]
33-
git-tree-sha1 = "0b581906418b93231d391b5dd78831fdc2da0c82"
33+
git-tree-sha1 = "017562e86afcd2a6a2a9220606a40b54604887c9"
3434
uuid = "47718e42-2ac5-11e9-14af-e5595289c2ea"
35-
version = "0.1.2"
35+
version = "0.1.5"
3636

3737
[[deps.BioSequences]]
3838
deps = ["BioSymbols", "PrecompileTools", "Random", "Twiddle"]
39-
git-tree-sha1 = "6cf406ea0d5bc901eabe55fca609007bce2db026"
39+
git-tree-sha1 = "6fdba8b4279460fef5674e9aa2dac7ef5be361d5"
4040
uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
41-
version = "3.1.5"
41+
version = "3.1.6"
4242

4343
[[deps.BioSymbols]]
4444
deps = ["PrecompileTools"]
@@ -48,15 +48,15 @@ version = "5.1.3"
4848

4949
[[deps.CodecZlib]]
5050
deps = ["TranscodingStreams", "Zlib_jll"]
51-
git-tree-sha1 = "9c209fb7536406834aa938fb149964b985de6c83"
51+
git-tree-sha1 = "b8fe8546d52ca154ac556809e10c75e6e7430ac8"
5252
uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
53-
version = "0.7.1"
53+
version = "0.7.5"
5454

5555
[[deps.Compat]]
56-
deps = ["UUIDs"]
57-
git-tree-sha1 = "4e88377ae7ebeaf29a047aa1ee40826e0b708a5d"
56+
deps = ["TOML", "UUIDs"]
57+
git-tree-sha1 = "b1c55339b7c6c350ee89f2c1604299660525b248"
5858
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
59-
version = "4.7.0"
59+
version = "4.15.0"
6060
weakdeps = ["Dates", "LinearAlgebra"]
6161

6262
[deps.Compat.extensions]
@@ -69,9 +69,9 @@ version = "1.0.5+1"
6969

7070
[[deps.DataStructures]]
7171
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
72-
git-tree-sha1 = "cf25ccb972fec4e4817764d01c82386ae94f77b4"
72+
git-tree-sha1 = "1d0a14036acb104d9e89698bd408f63ab58cdc82"
7373
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
74-
version = "0.18.14"
74+
version = "0.18.20"
7575

7676
[[deps.Dates]]
7777
deps = ["Printf"]
@@ -123,21 +123,21 @@ uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
123123
version = "0.3.23+2"
124124

125125
[[deps.OrderedCollections]]
126-
git-tree-sha1 = "d321bf2de576bf25ec4d3e4360faca399afca282"
126+
git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5"
127127
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
128-
version = "1.6.0"
128+
version = "1.6.3"
129129

130130
[[deps.PrecompileTools]]
131131
deps = ["Preferences"]
132-
git-tree-sha1 = "9673d39decc5feece56ef3940e5dafba15ba0f81"
132+
git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f"
133133
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
134-
version = "1.1.2"
134+
version = "1.2.1"
135135

136136
[[deps.Preferences]]
137137
deps = ["TOML"]
138-
git-tree-sha1 = "7eb1686b4f04b82f96ed7a4ea5890a4f0c7a09f1"
138+
git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6"
139139
uuid = "21216c6a-2e73-6563-6e65-726566657250"
140-
version = "1.4.0"
140+
version = "1.4.3"
141141

142142
[[deps.Printf]]
143143
deps = ["Unicode"]
@@ -151,18 +151,6 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
151151
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
152152
version = "0.7.0"
153153

154-
[[deps.SIMD]]
155-
deps = ["PrecompileTools"]
156-
git-tree-sha1 = "0e270732477b9e551d884e6b07e23bb2ec947790"
157-
uuid = "fdea26ae-647d-5447-a871-4b548cad5224"
158-
version = "3.4.5"
159-
160-
[[deps.ScanByte]]
161-
deps = ["Libdl", "SIMD"]
162-
git-tree-sha1 = "d49e35f413186528f1d7cc675e67d0ed16fd7800"
163-
uuid = "7b38b023-a4d7-4c5e-8d43-3f3097f304eb"
164-
version = "0.4.0"
165-
166154
[[deps.Serialization]]
167155
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
168156

@@ -195,9 +183,9 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
195183

196184
[[deps.XAM]]
197185
deps = ["Automa", "BGZFStreams", "BioAlignments", "BioGenerics", "BioSequences", "GenomicFeatures", "Indexes", "Printf", "TranscodingStreams"]
198-
git-tree-sha1 = "634c3177fdf7d52498d4b947080bc73be833eeab"
186+
git-tree-sha1 = "e402710abf3f0bbed192896851bef8e483cf7952"
199187
uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c"
200-
version = "0.3.1"
188+
version = "0.4.0"
201189

202190
[[deps.Zlib_jll]]
203191
deps = ["Libdl"]

Project.toml

100644100755
+2-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "BioRecordsProcessing"
22
uuid = "321bc2d7-9525-42af-8b23-17b2621d5ea8"
33
authors = ["Jonathan Bieler <[email protected]> and contributors"]
4-
version = "0.2.2"
4+
version = "0.2.3"
55

66
[deps]
77
BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6"
@@ -15,7 +15,7 @@ BGZFStreams = "0.3"
1515
CodecZlib = "0.7"
1616
Glob = "1.3"
1717
julia = "1.7"
18-
XAM = "0.3, 0.4"
18+
XAM = "0.4"
1919
GenomicFeatures = "2, 2.1"
2020

2121
[extras]

README.md

100644100755
File mode changed.

docs/Project.toml

100644100755
-1
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,4 @@ BioRecordsProcessing = "321bc2d7-9525-42af-8b23-17b2621d5ea8"
33
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
44
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
55
FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12"
6-
VariantCallFormat = "28eba6e3-a997-4ad9-87c6-d933b8bca6c1"
76
XAM = "d759349c-bcba-11e9-07c2-5b90f8f05f7c"

docs/make.jl

100644100755
File mode changed.

docs/src/API.md

100644100755
File mode changed.

docs/src/examples.md

100644100755
File mode changed.

docs/src/index.md

100644100755
File mode changed.

src/BioRecordsProcessing.jl

100644100755
File mode changed.

src/Pipeline.jl

100644100755
File mode changed.

src/Processor.jl

100644100755
File mode changed.

src/RecordGrouper.jl

100644100755
+1-1
Original file line numberDiff line numberDiff line change
@@ -57,5 +57,5 @@ end
5757
BAMPairedReadGrouper() = RecordGrouper{BAM.Record, String}(
5858
r -> BAM.tempname(r),
5959
records -> length(records) >= 2,
60-
r -> BAM.isprimary(r)
60+
r -> BAM.isprimaryalignment(r)
6161
)

src/Sink.jl

100644100755
File mode changed.

src/Source.jl

100644100755
+31-17
Original file line numberDiff line numberDiff line change
@@ -94,35 +94,43 @@ Reader(FASTX.FASTA, File("test.fa"))
9494
Reader(FASTX.FASTQ, Directory("data/", "*.fastq"))
9595
```
9696
"""
97-
struct Reader{F} <: AbstractSource where {F <: AbstractFileProvider}
97+
struct Reader{F,I} <: AbstractSource where {F <: AbstractFileProvider, I}
9898
record_module::Module
9999
file_provider::F
100+
index::I
100101

101-
Reader(record_module::Module, file_provider::F) where {F <: AbstractFileProvider} = new{F}(record_module, file_provider)
102+
Reader(record_module::Module, file_provider::F, index::I) where {F <: AbstractFileProvider, I} = new{F, I}(record_module, file_provider, index)
102103
end
103-
Reader(record_module::Module, filename::String) = Reader(record_module, File(filename))
104+
Reader(record_module::Module, file_provider::F; index=nothing) where {F <: AbstractFileProvider} = Reader(record_module, file_provider, index)
105+
Reader(record_module::Module, filename::String; index=nothing) = Reader(record_module, File(filename); index=index)
104106

105107
record_type(reader::Reader{F}) where {F} = reader.record_module
106108
_filename(reader::Reader) = _filename(reader.file_provider)
107109
is_paired(reader::Reader) = is_paired(reader.file_provider)
108110

109-
function open_reader(s::Reader, filepath, filename, extension)
111+
function open_reader(source::Reader, filepath, filename, extension)
110112

111-
RecordType = record_type(s)
113+
RecordType = record_type(source)
112114

113115
if extension == ".gz"
114116
reader = RecordType.Reader(GzipDecompressorStream(open(filepath)))
115117

116118
elseif extension == ".bam"
117-
index_file = filepath * ".bai"
118-
if !isfile(index_file)
119-
@warn "Index file not found : $index_file"
120-
reader = RecordType.Reader(open(filepath))
119+
120+
if !isnothing(source.index)
121+
reader = RecordType.Reader(open(filepath); index = source.index)
121122
else
122-
reader = RecordType.Reader(open(filepath); index = index_file)
123+
index_file = filepath * ".bai"
124+
if !isfile(index_file)
125+
@warn "Index file not found : $index_file"
126+
reader = RecordType.Reader(open(filepath))
127+
else
128+
reader = RecordType.Reader(open(filepath); index = index_file)
129+
end
123130
end
124-
if !isnothing(interval(s.file_provider))
125-
seek_region!(reader, interval(s.file_provider))
131+
132+
if !isnothing(interval(source.file_provider))
133+
seek_region!(reader, interval(source.file_provider))
126134
end
127135

128136
else
@@ -131,18 +139,24 @@ function open_reader(s::Reader, filepath, filename, extension)
131139
reader, RecordType
132140
end
133141

142+
143+
function Base.show(io::IO, source::Reader)
144+
print(io, "Reader($(source.record_module), $(source.file_provider))")
145+
end
146+
147+
134148
"""
135149
```julia
136-
Buffer(data::Vector{T}; filename = "")
150+
Buffer(data::T; filename = "")
137151
```
138152
139-
Use the array `data` as a source of records. An optional filename can be provided when a `Writer`
153+
Use the collection `data` as a source of records. An optional filename can be provided when a `Writer`
140154
is used as a sink.
141155
"""
142156
struct Buffer{T} <: AbstractSource
143-
data::Vector{T}
157+
data::T
144158
filename::String
145-
Buffer(data::Vector{T}; filename = "") where T = new{T}(data, filename)
159+
Buffer(data::T; filename = "") where T = new{T}(data, filename)
146160
end
147161

148162
function Base.show(io::IO, source::Buffer{T}) where T
@@ -159,7 +173,7 @@ function seek_region!(reader::BAM.Reader, region)
159173
refindex = findfirst(isequal(region.seqname), reader.refseqnames)
160174
refindex == nothing && throw(ArgumentError("sequence name $(iter.refname) is not found in the header"))
161175

162-
chunks = XAM.BAM.Indexes.overlapchunks(reader.index.index, refindex, region.first:region.last )
176+
chunks = XAM.BAM.Indexes.overlapchunks(reader.index.index, refindex, region.first:region.last)
163177
if !isempty(chunks)
164178
seek(reader, first(chunks).start)
165179
end

test/Project.toml

100644100755
-1
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,4 @@ BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
33
FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12"
44
FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd"
55
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
6-
VariantCallFormat = "28eba6e3-a997-4ad9-87c6-d933b8bca6c1"
76
XAM = "d759349c-bcba-11e9-07c2-5b90f8f05f7c"

test/data/illumina_full_range_as_illumina.fastq

100644100755
File mode changed.

test/data/illumina_full_range_as_illumina.fastq.gz

100644100755
File mode changed.

test/data/illumina_full_range_as_illumina.processed.fastq

100644100755
File mode changed.

test/runtests.jl

100644100755
+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using BioRecordsProcessing
2-
using Test, FASTX, XAM, VariantCallFormat, BioSequences, FormatSpecimens
2+
using Test, FASTX, XAM, BioSequences, FormatSpecimens
3+
# VariantCallFormat is temporarily not loaded; see https://github.com/rasmushenningsson/VariantCallFormat.jl/issues/5
34

45
@testset "Internals" begin
56
@test BioRecordsProcessing.insert_suffix("name", ".fastq", ".processed") == "name.processed.fastq"

test/test_external_tool.jl

100644100755
File mode changed.

test/test_pipeline.jl

100644100755
+34-3
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@
8888
@test out[1] != out[2] # make sure Collect is copied
8989
end
9090

91+
# Disabled until https://github.com/rasmushenningsson/VariantCallFormat.jl/issues/5 is fixed.
92+
if false
9193
@testset "VCF" begin
9294
mktempdir() do dir
9395
filepath = joinpath(path_of_format("VCF"), "adeno_virus.vcf")
@@ -117,9 +119,9 @@
117119
@test_throws ErrorException run(p)
118120
end
119121
end
122+
end
120123

121-
@testset "Buffer + Collect" begin
122-
124+
@testset "Buffer + Collect" begin
123125
input = rand(10)
124126
p = Pipeline(
125127
Buffer(input),
@@ -129,7 +131,18 @@
129131
@show p
130132
output = run(p)
131133
@test all(output .≈ 2*input)
132-
134+
end
135+
136+
@testset "Buffer + Collect with generator" begin
137+
input = (i for i in 1:10)
138+
p = Pipeline(
139+
Buffer(input),
140+
x -> 2x,
141+
Collect(Float64),
142+
)
143+
@show p
144+
output = run(p)
145+
@test all(output .≈ 2 .* input)
133146
end
134147

135148
@testset "Buffer + Writer" begin
@@ -281,6 +294,24 @@
281294
end
282295
end
283296

297+
@testset "BAM + Index" begin
298+
mktempdir() do dir
299+
spec = list_valid_specimens("BAM")
300+
bam = joinpath(path_of_format("BAM"), "SRR7993829_1.100K.forward.bam")
301+
index = BAM.BAI(bam * ".bai")
302+
303+
p = Pipeline(
304+
Reader(BAM, File(bam); index = index),
305+
Collect(BAM.Record)
306+
)
307+
@show p
308+
out = run(p)
309+
@test length(out) == 36405
310+
end
311+
end
312+
313+
314+
284315
@testset "BAM to paired FASTQ" begin
285316
mktempdir() do dir
286317
spec = list_valid_specimens("BAM")

0 commit comments

Comments
 (0)