1
1
//! A high performance Web Archive (WARC) file parser
2
2
//!
3
+ //! The WarcReader iterates over [WarcRecords](WarcRecord) from a [BufRead] input.
4
+ //!
5
+ //! Performance should be quite good: roughly 500MiB/s on a single CPU core.
6
+ //!
3
7
//! ## Usage
4
8
//!
5
9
//! ```rust
6
10
//! use rust_warc::WarcReader;
7
11
//!
8
- //! use std::io;
9
- //!
10
12
//! fn main() {
11
- //! let stdin = io::stdin();
13
+ //! // we're taking input from stdin here, but any BufRead will do
14
+ //! let stdin = std::io::stdin();
12
15
//! let handle = stdin.lock();
16
+ //!
13
17
//! let mut warc = WarcReader::new(handle);
14
18
//!
15
19
//! let mut response_counter = 0;
@@ -94,7 +98,8 @@ impl Into<String> for CaseString {
94
98
/// WARC/1.1
95
99
/// WARC-Type: warcinfo
96
100
/// WARC-Date: 2006-09-19T17:20:14Z
97
- /// WARC-Record-ID: <urn:uuid:d7ae5c10-e6b3-4d27-967d-34780c58ba39>
101
+ /// WARC-Record-ID: multiline
102
+ /// uuid value
98
103
/// Content-Type: text/plain
99
104
/// Content-Length: 4
100
105
///
@@ -110,6 +115,8 @@ impl Into<String> for CaseString {
110
115
///
111
116
/// // header names are case insensitive
112
117
/// assert_eq!(item.header.get(&"content-type".into()), Some(&"text/plain".into()));
118
+ /// // and may span multiple lines
119
+ /// assert_eq!(item.header.get(&"Warc-Record-ID".into()), Some(&"multiline\nuuid value".into()));
113
120
///
114
121
/// assert_eq!(item.content, "test".as_bytes());
115
122
/// ```
@@ -142,33 +149,49 @@ impl WarcRecord {
142
149
143
150
let mut header = HashMap :: < CaseString , String > :: with_capacity ( 16 ) ; // no allocations if <= 16 header fields
144
151
152
+ let mut continuation: Option < ( CaseString , String ) > = None ;
145
153
loop {
146
154
let mut line_buf = String :: new ( ) ;
147
155
148
156
if let Err ( io) = read. read_line ( & mut line_buf) {
149
157
return Err ( WarcError :: IO ( io) ) ;
150
158
}
151
159
152
- // leniency: allow absent carriage return
153
- if & line_buf == "\r \n " || & line_buf == "\n " {
160
+ if & line_buf == "\r \n " {
154
161
break ;
155
162
}
156
163
157
- // todo field multiline continuations
158
-
159
164
rtrim ( & mut line_buf) ;
160
165
161
- if let Some ( semi) = line_buf. find ( ':' ) {
162
- let value = line_buf. split_off ( semi + 1 ) . trim ( ) . to_string ( ) ;
163
- line_buf. pop ( ) ; // eat colon
164
- rtrim ( & mut line_buf) ;
165
-
166
- header. insert ( line_buf. into ( ) , value) ;
166
+ if line_buf. starts_with ( ' ' ) || line_buf. starts_with ( '\t' ) {
167
+ if let Some ( keyval) = & mut continuation {
168
+ keyval. 1 . push ( '\n' ) ;
169
+ keyval. 1 . push_str ( line_buf. trim ( ) ) ;
170
+ } else {
171
+ return Err ( WarcError :: Malformed ( String :: from ( "Invalid header block" ) ) ) ;
172
+ }
167
173
} else {
168
- return Err ( WarcError :: Malformed ( String :: from ( "Invalid header field" ) ) ) ;
174
+ if let Some ( ( key, value) ) = std:: mem:: replace ( & mut continuation, None ) {
175
+ header. insert ( key, value) ;
176
+ }
177
+
178
+ if let Some ( semi) = line_buf. find ( ':' ) {
179
+ let value = line_buf. split_off ( semi + 1 ) . trim ( ) . to_string ( ) ;
180
+ line_buf. pop ( ) ; // eat colon
181
+ rtrim ( & mut line_buf) ;
182
+
183
+ continuation = Some ( ( line_buf. into ( ) , value) ) ;
184
+ } else {
185
+ return Err ( WarcError :: Malformed ( String :: from ( "Invalid header field" ) ) ) ;
186
+ }
169
187
}
170
188
}
171
189
190
+ // insert leftover continuation
191
+ if let Some ( ( key, value) ) = continuation {
192
+ header. insert ( key, value) ;
193
+ }
194
+
172
195
let content_len = header. get ( & "Content-Length" . into ( ) ) ;
173
196
if content_len. is_none ( ) {
174
197
return Err ( WarcError :: Malformed ( String :: from (
0 commit comments