22
Jun

80Legs file extractor


#!/usr/bin/python
 
# Original Code
# http://github.com/turian/py80legsformat by Joseph Turian
 
import csv
import hashlib
import os
import struct
import sys
import zipfile
from cStringIO import StringIO
from optparse import OptionParser
 
class EightyLegs:
    """Extract the pages stored in an 80legs result file (``.80`` or ``.zip``).

    A ``.80`` stream starts with two 32-bit integers (class ID and version
    ID) followed by records of the form
    ``<url size><url><data size><data>``; both sizes are 32-bit integers in
    native byte order and the URL is UTF-8 encoded.
    """

    # Header values every supported .80 stream must start with.
    _CLASS_ID = 218217067
    _VERSION_ID = 1

    def __init__(self, filename):
        """Remember the input file and derive the output directory from it.

        The output directory is the input path with everything from the
        first dot of the *file name* removed.  Only the base name is split,
        so dots in parent directories (``./x.zip``) are left untouched.
        """
        self.filename = filename
        head, tail = os.path.split(filename)
        self.directory = os.path.join(head, tail.split('.')[0])

    @staticmethod
    def _read_exact(stream, size, what):
        """Read exactly ``size`` bytes from ``stream``.

        Raises:
            ValueError: if ``size`` is negative or the stream ends early.
        """
        if size < 0:
            # read(-1) would silently slurp the rest of the stream.
            raise ValueError("invalid %s size: %d" % (what, size))
        chunk = stream.read(size)
        if len(chunk) != size:
            raise ValueError(
                "truncated 80legs data: expected %d bytes of %s, got %d"
                % (size, what, len(chunk)))
        return chunk

    def read(self, file):
        """Yield ``(url, data)`` pairs from a binary ``.80`` stream.

        Args:
            file: file-like object opened in binary mode.

        Yields:
            Tuples of the decoded URL (text) and the raw page bytes.

        Raises:
            ValueError: if the header is not the expected class/version
                pair or the stream is truncated in the middle of a record.
        """
        # "=" selects native byte order with the standard 4-byte int size.
        header = struct.unpack("=ii", self._read_exact(file, 8, "header"))
        if header != (self._CLASS_ID, self._VERSION_ID):
            raise ValueError(
                "unsupported 80legs header (classID, versionID): %r"
                % (header,))

        while True:
            prefix = file.read(4)
            if not prefix:
                # Clean end of stream on a record boundary.
                return
            if len(prefix) != 4:
                raise ValueError("truncated 80legs data: incomplete URL size")
            (url_size,) = struct.unpack("=i", prefix)
            url = self._read_exact(file, url_size, "URL").decode("utf-8")
            (data_size,) = struct.unpack(
                "=i", self._read_exact(file, 4, "data size"))
            data = self._read_exact(file, data_size, "data")
            yield (url, data)

    def read_zip(self, file):
        """Yield ``(url, data)`` pairs from every ``.80`` member of a zip.

        Args:
            file: path or binary file-like object accepted by
                ``zipfile.ZipFile``.
        """
        archive = zipfile.ZipFile(file, 'r')
        try:
            for info in archive.infolist():
                if not info.filename.endswith('.80'):
                    continue
                # Stream the member instead of loading it fully in memory.
                member = archive.open(info)
                try:
                    for record in self.read(member):
                        yield record
                finally:
                    member.close()
        finally:
            archive.close()

    def parse(self):
        """Extract every page of the input file to disk.

        Creates ``self.directory`` if needed, writes each page to
        ``<directory>/<md5 of url>.html`` and records the ``md5, url`` pairs
        in the tab-separated file ``<directory>.tsv``.  Each URL is printed
        as it is processed.
        """
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)

        if self.filename.endswith('.zip'):
            reader = self.read_zip
        else:
            reader = self.read

        # The input is binary; text mode would corrupt it on some platforms.
        with open(self.filename, 'rb') as source:
            with open("%s.tsv" % self.directory, 'w') as index:
                tsv = csv.writer(index, delimiter='\t', lineterminator='\n')
                for url, data in reader(source):
                    print(url)
                    # Hash the UTF-8 bytes so non-ASCII URLs do not fail.
                    digest = hashlib.md5(url.encode("utf-8")).hexdigest()
                    page_path = os.path.join(
                        self.directory, "%s.html" % digest)
                    with open(page_path, 'wb') as page:
                        page.write(data)
                    # When the URL is not a native str (Python 2 unicode),
                    # csv needs it as UTF-8 encoded bytes.
                    if isinstance(url, str):
                        cell = url
                    else:
                        cell = url.encode("utf-8")
                    tsv.writerow([digest, cell])
 
def main():
    """Command-line entry point: extract the file given with -f/--file.

    When no input file is supplied the help text is printed and the process
    exits with status 0.
    """
    cli = OptionParser(
        usage='Usage: %prog -f 19970_20966_a_1.zip or %prog -f 19970_20966_a_1.80')
    cli.add_option('-f', '--file', dest='filename', help='input file')

    opts, _positional = cli.parse_args()
    target = opts.filename

    if target is not None:
        # Happy path: run the extraction and finish normally.
        EightyLegs(target).parse()
        return

    cli.print_help()
    sys.exit(0)


# Run the extractor only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

I had issues with the original version, which would break on some scrapes. I'm hoping to pass this code on to the original author so it can be merged back.

Comments

Post new comment

  • Web page addresses and e-mail addresses turn into links automatically.
  • Allowed HTML tags: <a> <em> <strong> <cite> <code> <ul> <ol> <li> <dl> <dt> <dd>
  • Lines and paragraphs break automatically.
  • You can enable syntax highlighting of source code with the following tags: <code>, <blockcode>, <apache>, <c>, <cpp>, <drupal5>, <drupal6>, <java>, <javascript>, <php>, <python>, <ruby>. The supported tag styles are: <foo>, [foo].

More information about formatting options