#!/usr/bin/python
# Original Code
# http://github.com/turian/py80legsformat by Joseph Turian
import csv
import hashlib
import os
import struct
import sys
import zipfile
from cStringIO import StringIO
from optparse import OptionParser
class EightyLegs:
    """Extract the pages stored in an 80legs-format (``.80``) result file.

    The input is either a raw ``.80`` file or a ``.zip`` archive containing
    ``.80`` members.  ``parse()`` writes every page body to
    ``<directory>/<md5 of url>.html`` and records one ``md5<TAB>url`` row per
    page in ``<directory>.tsv``.
    """

    def __init__(self, filename):
        """Store the input path and derive the output directory from it.

        The output directory is the input path with everything from the first
        dot of the *file name* onwards removed.  Only the last path component
        is trimmed, so dots in parent directories (``./crawl.80``,
        ``../x.zip``, ``my.dir/x.80``) no longer truncate the path.
        """
        self.filename = filename
        head, tail = os.path.split(filename)
        # Fall back to the full name for dot-files, whose stem would be empty.
        self.directory = os.path.join(head, tail.split('.')[0] or tail)

    def read(self, file):
        """Yield ``(url, data)`` pairs from an open binary ``.80`` stream.

        Layout, using native 4-byte ints (``struct`` format ``"i"``):
        a ``(classID, versionID)`` header that must equal
        ``(218217067, 1)``, followed by zero or more records of
        ``URLSIZE, url bytes, DATASIZE, data bytes``.

        ``url`` is decoded from UTF-8; ``data`` is returned as raw bytes.

        Raises:
            AssertionError: if the header does not match.
            struct.error: if a size field is truncated.
        """
        # NOTE: asserts are kept so callers see the same exception type as
        # before; be aware they are skipped when Python runs with -O.
        assert struct.calcsize("i") == 4, "expected 4-byte native ints"
        header = file.read(2 * 4)
        (classID, versionID) = struct.unpack("ii", header)
        assert (classID, versionID) == (218217067, 1), "not an 80legs file"

        size_field = file.read(4)
        # An empty read means EOF; truthiness works for both str and bytes.
        while size_field:
            (url_size,) = struct.unpack("i", size_field)
            url = file.read(url_size).decode("utf-8")
            (data_size,) = struct.unpack("i", file.read(4))
            yield (url, file.read(data_size))
            size_field = file.read(4)

    def read_zip(self, file):
        """Yield ``(url, data)`` pairs from every ``.80`` member of a zip.

        ``file`` is an open binary file object (or path) accepted by
        ``zipfile.ZipFile``.  Members whose name does not end in ``.80`` are
        skipped.
        """
        archive = zipfile.ZipFile(file, 'r')
        try:
            for info in archive.infolist():
                member = info.filename
                if not member.endswith('.80'):
                    continue
                # Each member is loaded into memory and parsed like a
                # stand-alone .80 file.
                for record in self.read(StringIO(archive.read(member))):
                    yield record
        finally:
            archive.close()

    def parse(self):
        """Extract every page of ``self.filename`` to disk.

        Creates ``self.directory`` if needed, prints each URL, writes the
        page body to ``<directory>/<md5(url)>.html`` and appends
        ``md5<TAB>url`` to ``<directory>.tsv``.
        """
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)

        if self.filename.endswith('.zip'):
            reader = self.read_zip
        else:
            reader = self.read

        # The input is binary, so it must be opened in 'rb'; text mode
        # corrupts the size fields on platforms that translate newlines.
        with open(self.filename, 'rb') as source:
            with open("%s.tsv" % self.directory, 'w') as index:
                tsv = csv.writer(index, delimiter='\t', lineterminator='\n')
                for url, data in reader(source):
                    print(url)
                    # Hash the UTF-8 bytes: md5 of a unicode object fails
                    # for non-ASCII URLs.
                    digest = hashlib.md5(url.encode("utf-8")).hexdigest()
                    page_path = os.path.join(self.directory,
                                             "%s.html" % digest)
                    with open(page_path, 'wb') as page:
                        page.write(data)
                    # csv wants the interpreter's native str type; on
                    # Python 2 that means UTF-8 encoded bytes.
                    if isinstance(url, str):
                        url_cell = url
                    else:
                        url_cell = url.encode("utf-8")
                    tsv.writerow([digest, url_cell])
def main():
    """Command-line entry point.

    Parses ``-f/--file`` and runs the extraction on that file.  When no
    file is given, prints the help text and exits with status 0.
    """
    parser = OptionParser(
        usage='Usage: %prog -f 19970_20966_a_1.zip or %prog -f 19970_20966_a_1.80')
    parser.add_option('-f', '--file', dest='filename', help='input file')
    options, _ = parser.parse_args()

    if options.filename is not None:
        EightyLegs(options.filename).parse()
        return

    # No input file supplied: show usage and stop.
    parser.print_help()
    sys.exit(0)
# Run the command-line interface only when executed as a script, not when
# this module is imported.
if __name__ == "__main__":
    main()