Skip to content

Commit 4f55f41

Browse files
committed
Handle multiline header fields
1 parent 3884e63 commit 4f55f41

File tree

3 files changed

+45
-17
lines changed

3 files changed

+45
-17
lines changed

src/bin/main.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,18 @@ fn main() {
88
let warc = WarcReader::new(handle);
99

1010
let mut response_counter = 0;
11+
let mut response_size = 0;
12+
1113
for item in warc {
1214
let record = item.unwrap(); // could be IO/malformed error
1315

1416
// header names are case insensitive
1517
if record.header.get(&"WARC-Type".into()) == Some(&"response".into()) {
1618
response_counter += 1;
19+
response_size += record.content.len();
1720
}
1821
}
1922

20-
println!("# response records: {}", response_counter);
23+
println!("response records: {}", response_counter);
24+
println!("response size: {} MiB", response_size >> 20);
2125
}

src/lib.rs

Lines changed: 38 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
//! A high performance Web Archive (WARC) file parser
22
//!
3+
//! The WarcReader iterates over [WarcRecords](WarcRecord) from a [BufRead] input.
4+
//!
5+
//! Performance should be quite good: roughly 500 MiB/s on a single CPU core.
6+
//!
37
//! ## Usage
48
//!
59
//! ```rust
610
//! use rust_warc::WarcReader;
711
//!
8-
//! use std::io;
9-
//!
1012
//! fn main() {
11-
//! let stdin = io::stdin();
13+
//! // we're taking input from stdin here, but any BufRead will do
14+
//! let stdin = std::io::stdin();
1215
//! let handle = stdin.lock();
16+
//!
1317
//! let mut warc = WarcReader::new(handle);
1418
//!
1519
//! let mut response_counter = 0;
@@ -94,7 +98,8 @@ impl Into<String> for CaseString {
9498
/// WARC/1.1
9599
/// WARC-Type: warcinfo
96100
/// WARC-Date: 2006-09-19T17:20:14Z
97-
/// WARC-Record-ID: <urn:uuid:d7ae5c10-e6b3-4d27-967d-34780c58ba39>
101+
/// WARC-Record-ID: multiline
102+
/// uuid value
98103
/// Content-Type: text/plain
99104
/// Content-Length: 4
100105
///
@@ -110,6 +115,8 @@ impl Into<String> for CaseString {
110115
///
111116
/// // header names are case insensitive
112117
/// assert_eq!(item.header.get(&"content-type".into()), Some(&"text/plain".into()));
118+
/// // header field values may span multiple lines (continuation lines start with a space or tab)
119+
/// assert_eq!(item.header.get(&"Warc-Record-ID".into()), Some(&"multiline\nuuid value".into()));
113120
///
114121
/// assert_eq!(item.content, "test".as_bytes());
115122
/// ```
@@ -142,33 +149,49 @@ impl WarcRecord {
142149

143150
let mut header = HashMap::<CaseString, String>::with_capacity(16); // no allocations if <= 16 header fields
144151

152+
let mut continuation: Option<(CaseString, String)> = None;
145153
loop {
146154
let mut line_buf = String::new();
147155

148156
if let Err(io) = read.read_line(&mut line_buf) {
149157
return Err(WarcError::IO(io));
150158
}
151159

152-
// leniency: allow absent carriage return
153-
if &line_buf == "\r\n" || &line_buf == "\n" {
160+
if &line_buf == "\r\n" {
154161
break;
155162
}
156163

157-
// todo field multiline continuations
158-
159164
rtrim(&mut line_buf);
160165

161-
if let Some(semi) = line_buf.find(':') {
162-
let value = line_buf.split_off(semi + 1).trim().to_string();
163-
line_buf.pop(); // eat colon
164-
rtrim(&mut line_buf);
165-
166-
header.insert(line_buf.into(), value);
166+
if line_buf.starts_with(' ') || line_buf.starts_with('\t') {
167+
if let Some(keyval) = &mut continuation {
168+
keyval.1.push('\n');
169+
keyval.1.push_str(line_buf.trim());
170+
} else {
171+
return Err(WarcError::Malformed(String::from("Invalid header block")));
172+
}
167173
} else {
168-
return Err(WarcError::Malformed(String::from("Invalid header field")));
174+
if let Some((key, value)) = std::mem::replace(&mut continuation, None) {
175+
header.insert(key, value);
176+
}
177+
178+
if let Some(semi) = line_buf.find(':') {
179+
let value = line_buf.split_off(semi + 1).trim().to_string();
180+
line_buf.pop(); // eat colon
181+
rtrim(&mut line_buf);
182+
183+
continuation = Some((line_buf.into(), value));
184+
} else {
185+
return Err(WarcError::Malformed(String::from("Invalid header field")));
186+
}
169187
}
170188
}
171189

190+
// insert leftover continuation
191+
if let Some((key, value)) = continuation {
192+
header.insert(key, value);
193+
}
194+
172195
let content_len = header.get(&"Content-Length".into());
173196
if content_len.is_none() {
174197
return Err(WarcError::Malformed(String::from(

src/test.warc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
WARC/1.1
22
WARC-Type: warcinfo
33
WARC-Date: 2006-09-19T17:20:14Z
4-
WARC-Record-ID: <urn:uuid:d7ae5c10-e6b3-4d27-967d-34780c58ba39>
4+
WARC-Record-ID: multiline
5+
uuid value
56
Content-Type: text/plain
67
Content-Length: 4
78

0 commit comments

Comments
 (0)