Skip to content

Commit ed170b7

Browse files
committed
Added warcscrape.py and supporting files.
Added warcscrape.py and supporting files.
1 parent c4895d5 commit ed170b7

10 files changed

+467
-378
lines changed

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
pytest
1+
nose

warc/__init__.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,7 @@ def detect_format(filename):
1515
"""Tries to figure out the type of the file. Return 'warc' for
1616
WARC files and 'arc' for ARC files"""
1717

18-
if ".arc" in filename:
19-
return "arc"
20-
if ".warc" in filename:
18+
if filename.endswith(".warc") or filename.endswith(".warc.gz"):
2119
return "warc"
2220

2321
return "unknown"

warc/arc.py

+49-49
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Provides support for ARC v1 files.
2+
Provides support for ARC v1 files.
33
44
:copyright: (c) 2012 Internet Archive
55
"""
@@ -28,21 +28,21 @@ class ARCHeader(CaseInsensitiveDict):
2828
* content_type
2929
* length (length of the n/w doc in bytes)
3030
31-
V2 header fields are
31+
V2 header fields are
3232
3333
* url
3434
* ip_address
3535
* date (date of archival)
36-
* content_type
36+
* content_type
3737
* result_code (response code)
38-
* checksum
38+
* checksum
3939
* location
4040
* offset (offset from beginning of file to record)
4141
* filename (name of arc file)
4242
* length (length of the n/w doc in bytes)
4343
4444
"""
45-
def __init__(self, url = "", ip_address = "", date = "", content_type = "",
45+
def __init__(self, url = "", ip_address = "", date = "", content_type = "",
4646
result_code = "", checksum = "", location = "", offset = "", filename = "", length = "", version = 2):
4747

4848
if isinstance(date, datetime.datetime):
@@ -55,8 +55,8 @@ def __init__(self, url = "", ip_address = "", date = "", content_type = "",
5555

5656
self.version = version
5757

58-
CaseInsensitiveDict.__init__(self,
59-
url = url,
58+
CaseInsensitiveDict.__init__(self,
59+
url = url,
6060
ip_address = ip_address,
6161
date = date,
6262
content_type = content_type,
@@ -66,10 +66,10 @@ def __init__(self, url = "", ip_address = "", date = "", content_type = "",
6666
offset = offset,
6767
filename = filename,
6868
length = length)
69-
69+
7070
def write_to(self, f, version = None):
7171
"""
72-
Writes out the arc header to the file like object `f`.
72+
Writes out the arc header to the file like object `f`.
7373
7474
If the version field is 1, it writes out an arc v1 header,
7575
otherwise (and this is default), it outputs a v2 header.
@@ -93,44 +93,44 @@ def write_to(self, f, version = None):
9393
filename = self['filename'],
9494
length = self['length'])
9595
f.write(header)
96-
96+
9797

9898
@property
9999
def url(self):
100100
return self["url"]
101-
101+
102102
@property
103103
def ip_address(self):
104104
return self["ip_address"]
105-
105+
106106
@property
107107
def date(self):
108108
return datetime.datetime.strptime(self['date'], "%Y%m%d%H%M%S")
109-
109+
110110
@property
111111
def content_type(self):
112112
return self["content_type"]
113-
113+
114114
@property
115115
def result_code(self):
116116
return self["result_code"]
117-
117+
118118
@property
119119
def checksum (self):
120120
return self["checksum"]
121-
121+
122122
@property
123123
def location(self):
124124
return self["location"]
125-
125+
126126
@property
127127
def offset(self):
128128
return int(self["offset"])
129-
129+
130130
@property
131131
def filename(self):
132132
return self["filename"]
133-
133+
134134
@property
135135
def length(self):
136136
return int(self["length"])
@@ -139,7 +139,7 @@ def __str__(self):
139139
f = io.StringIO()
140140
self.write_to(f)
141141
return f.getvalue()
142-
142+
143143
def __repr__(self):
144144
f = {}
145145
for i in "url ip_address date content_typeresult_code checksum location offset filename length".split():
@@ -149,19 +149,19 @@ def __repr__(self):
149149
s = ", ".join(s)
150150
return "<ARCHeader(%s)>"%s
151151

152-
152+
153153
class ARCRecord(object):
154154
def __init__(self, header = None, payload = None, headers = {}, version = None):
155155
if not (header or headers):
156156
raise TypeError("Can't write create an ARC1 record without a header")
157157
self.header = header or ARCHeader(version = version, **headers)
158158
self.payload = payload
159159
self.version = version
160-
160+
161161
@classmethod
162162
def from_string(cls, string, version):
163163
"""
164-
Constructs an ARC record from a string and returns it.
164+
Constructs an ARC record from a string and returns it.
165165
166166
TODO: It might be best to merge this with the _read_arc_record
167167
function rather than reimplement the functionality here.
@@ -199,20 +199,20 @@ def __getitem__(self, name):
199199
def __setitem__(self, name, value):
200200
self.header[name] = value
201201

202-
202+
203203
def __str__(self):
204204
f = io.StringIO()
205205
self.write_to(f)
206206
return f.getvalue()
207-
208-
207+
208+
209209
class ARCFile(object):
210210
def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_headers = {}, compress=False):
211211
"""
212212
Initialises a file like object that can be used to read or
213213
write Arc files. Works for both version 1 and version 2.
214214
215-
This can be called similar to the builtin `file` constructor.
215+
This can be called similar to the builtin `file` constructor.
216216
217217
It can also just be given a fileobj which is a file like
218218
object that it will use directly for its work.
@@ -224,7 +224,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_
224224
225225
* ip_address - IP address of the machine doing the Archiving
226226
* date - Date of archival
227-
* org - Organisation that's doing the Archiving.
227+
* org - Organisation that's doing the Archiving.
228228
229229
The version parameter tries to work intuitively as follows
230230
@@ -248,7 +248,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_
248248
* When we try to read a record, it will read out one
249249
record and try to guess the version from it (for
250250
the first read).
251-
251+
252252
"""
253253
if fileobj is None:
254254
fileobj = builtins.open(filename, mode or "rb")
@@ -259,7 +259,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_
259259

260260
if compress:
261261
fileobj = gzip.open(fileobj, mode)
262-
262+
263263
self.fileobj = fileobj
264264

265265
self.filename = filename
@@ -282,7 +282,7 @@ def __enter__(self):
282282

283283
def __exit__(self, exc_type, exc_value, traceback):
284284
self.close()
285-
285+
286286
def _write_header(self):
287287
"Writes out an ARC header"
288288
if "org" not in self.file_headers:
@@ -301,15 +301,15 @@ def _write_header(self):
301301
payload = "2 0 %(org)s\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length"
302302
else:
303303
raise IOError("Can't write an ARC file with version '\"%s\"'"%self.version)
304-
304+
305305
fname = os.path.basename(self.filename)
306306
header = ARCHeader(url = "filedesc://%s"%fname,
307-
ip_address = self.file_headers['ip_address'],
307+
ip_address = self.file_headers['ip_address'],
308308
date = self.file_headers['date'],
309-
content_type = "text/plain",
309+
content_type = "text/plain",
310310
length = len(payload),
311311
result_code = "200",
312-
checksum = "-",
312+
checksum = "-",
313313
location = "-",
314314
offset = str(self.fileobj.tell()),
315315
filename = fname)
@@ -340,7 +340,7 @@ def _read_file_header(self):
340340
# print "--------------------------------------------------"
341341
if self.version and int(self.version) != version:
342342
raise IOError("Version mismatch. Requested version was '%s' but version in file was '%s'"%(self.version, version))
343-
343+
344344
if version == '1':
345345
url, ip_address, date, content_type, length = header.split()
346346
self.file_headers = {"ip_address" : ip_address,
@@ -404,13 +404,13 @@ def _read_arc_record(self):
404404
self.fileobj.readline() # Munge the separator newline.
405405

406406
return ARCRecord(header = arc_header, payload = payload)
407-
407+
408408
def read(self):
409409
"Reads out an arc record from the file"
410410
if not self.header_read:
411411
self._read_file_header()
412412
return self._read_arc_record()
413-
413+
414414
# For compatibility with WARCFile
415415
read_record = read
416416
write_record = write
@@ -420,16 +420,16 @@ def __iter__(self):
420420
while record:
421421
yield record
422422
record = self.read()
423-
423+
424424
def close(self):
425425
self.fileobj.close()
426-
427-
428-
429-
430-
431-
432-
433-
434-
435-
426+
427+
428+
429+
430+
431+
432+
433+
434+
435+

0 commit comments

Comments
 (0)