22
Jun
0
80Legs file extractor
#!/usr/bin/python
# Original Code
# http://github.com/turian/py80legsformat by Joseph Turian
#
# Extracts the (url, data) records stored in an 80legs ".80" file, or in the
# ".80" members of a zip archive.  Each record's data is written to
# <directory>/<md5-of-url>.html and a "<directory>.tsv" index maps every
# digest back to its URL.  Runs on Python 2.7 and Python 3.

import csv
import hashlib
import os
import struct
import sys
import zipfile
from io import BytesIO
from optparse import OptionParser

# The csv module wants byte strings + a binary file on Python 2, and text +
# a newline='' text file on Python 3.
_PY2 = sys.version_info[0] == 2

# The (classID, versionID) pair the reader requires at the start of a stream.
_MAGIC = (218217067, 1)

# "=" keeps the native byte order that the original plain "i" format used,
# but pins every integer to 4 bytes regardless of platform.
_HEADER = struct.Struct("=ii")
_INT = struct.Struct("=i")


def _read_exact(stream, size, what):
    """Read exactly *size* bytes of *what* from *stream*.

    Raises ValueError if *size* is negative (a negative count would make
    ``read`` swallow the rest of the stream) or if the stream ends early.
    """
    if size < 0:
        raise ValueError("corrupt 80legs data: negative %s length %d"
                         % (what, size))
    chunk = stream.read(size)
    if len(chunk) != size:
        raise ValueError("truncated 80legs data: expected %d bytes of %s, "
                         "got %d" % (size, what, len(chunk)))
    return chunk


def _open_tsv(path):
    """Open *path* for writing in the mode the csv module needs."""
    if _PY2:
        return open(path, 'wb')
    return open(path, 'w', newline='', encoding='utf-8')


def _echo(text):
    """Print *text*, escaping characters the console cannot encode."""
    try:
        print(text)
    except UnicodeEncodeError:
        print(text.encode('ascii', 'backslashreplace').decode('ascii'))


class EightyLegs:
    """Reader/extractor for one 80legs result file (``.80`` or ``.zip``)."""

    def __init__(self, filename):
        """Remember *filename* and derive the output directory from it.

        The directory is the file's base name up to its first dot, placed
        next to the input file.  Only the base name is split, so dots in
        the directory part of the path (``./x.zip``, ``../data/x.80``) no
        longer truncate the result.
        """
        self.filename = filename
        stem = os.path.basename(filename).split('.')[0]
        self.directory = os.path.join(os.path.dirname(filename), stem)

    def read(self, file):
        """Yield ``(url, data)`` for every record in the binary stream *file*.

        *file* must be opened in binary mode.  ``url`` is the UTF-8 decoded
        URL text and ``data`` is the raw payload bytes.

        Raises ValueError if the header does not match the expected
        classID/versionID, if a length prefix is negative, or if the stream
        ends in the middle of a record.
        """
        header = _HEADER.unpack(_read_exact(file, _HEADER.size, "header"))
        if header != _MAGIC:
            raise ValueError("not an 80legs file: header %r, expected %r"
                             % (header, _MAGIC))
        while True:
            prefix = file.read(_INT.size)
            if not prefix:
                # Clean end of stream: no further records.
                return
            if len(prefix) != _INT.size:
                raise ValueError("truncated 80legs data: partial URL length")
            (url_size,) = _INT.unpack(prefix)
            url = _read_exact(file, url_size, "URL").decode("utf-8")
            (data_size,) = _INT.unpack(
                _read_exact(file, _INT.size, "data length"))
            data = _read_exact(file, data_size, "data")
            yield (url, data)

    def read_zip(self, file):
        """Yield the records of every ``.80`` member of the zip *file*.

        *file* is a path or a binary file object accepted by
        ``zipfile.ZipFile``.  Members with other extensions are skipped.
        """
        with zipfile.ZipFile(file, 'r') as archive:
            for info in archive.infolist():
                member = info.filename
                if not member.endswith('.80'):
                    continue
                for record in self.read(BytesIO(archive.read(member))):
                    yield record

    def parse(self):
        """Extract every record of ``self.filename`` to disk.

        Creates ``self.directory`` if needed, writes each record's data to
        ``<directory>/<md5 of url>.html``, appends ``digest<TAB>url`` to
        ``<directory>.tsv`` and prints each URL as it is processed.
        """
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)
        if self.filename.endswith('.zip'):
            reader = self.read_zip
        else:
            reader = self.read
        with _open_tsv("%s.tsv" % self.directory) as index:
            tsv = csv.writer(index, delimiter='\t', lineterminator='\n')
            # Binary mode: the payload is length-prefixed raw bytes.
            with open(self.filename, 'rb') as source:
                for url, data in reader(source):
                    _echo(url)
                    # Hash the UTF-8 bytes so non-ASCII URLs work; for ASCII
                    # URLs the digest is unchanged.
                    encoded_url = url.encode('utf-8')
                    digest = hashlib.md5(encoded_url).hexdigest()
                    page_path = "%s/%s.html" % (self.directory, digest)
                    with open(page_path, 'wb') as page:
                        page.write(data)
                    tsv.writerow([digest, encoded_url if _PY2 else url])


def main():
    """Command-line entry point: extract the file given with -f/--file."""
    usage = 'Usage: %prog -f 19970_20966_a_1.zip or %prog -f 19970_20966_a_1.80'
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--file', dest='filename', help='input file')
    (options, args) = parser.parse_args()
    if options.filename is None:
        parser.print_help()
        sys.exit(0)
    legs = EightyLegs(options.filename)
    legs.parse()


if __name__ == '__main__':
    main()
I had issues with the original version: it would break on some scrapes. I'm hoping to pass this code on to the original author so that it can be merged back.










