IEWebArchive

Reading IE-style web archives (.mht files).

Sources

MHTDocument.py

from Cocoa import NSDocument
import objc
from loader import MHTLoader

class MHTDocument (NSDocument):
    """
    NSDocument subclass that shows the contents of an .mht archive in a
    web view.
    """

    # Interface Builder outlets. ``webview`` is still None while the nib has
    # not been loaded, which readFromFile_ofType_ uses to defer loading.
    locationbox = objc.IBOutlet()
    webview = objc.IBOutlet()

    # Path of a file that was opened before the nib was loaded; picked up
    # again in windowControllerDidLoadNib_.
    path = None
    statusText = None

    @objc.IBAction
    def navigateHistory_(self, sender):
        """
        Action for the back/forward control: segment 0 goes back in the
        web view's history, any other segment goes forward.
        """
        if sender.selectedSegment() == 0:
            self.webview.goBack_(sender)
        else:
            self.webview.goForward_(sender)

    def windowNibName(self):
        """Return the name of the nib that contains the document window."""
        return "MHTDocument"

    def readFromFile_ofType_(self, path, tp):
        """
        Open the archive at *path*.

        When the web view outlet is not set yet the path is only remembered
        and the actual load happens in windowControllerDidLoadNib_.
        Always returns True.
        """
        if self.webview is None:
            self.path = path
        else:
            self.readMHT_(path)

        return True

    def writeToFile_ofType_(self, path, tp):
        """Saving is not implemented; always returns False."""
        # TODO: "save-as" functionality
        return False

    def windowControllerDidLoadNib_(self, controller):
        """Load the file remembered by readFromFile_ofType_, if any."""
        if self.path:
            self.readMHT_(self.path)

    def readMHT_(self, path):
        """
        Parse the .mht file at *path*, show its root URL in the location
        box and load the converted web archive into the web view.
        """
        self.mht = MHTLoader(path)
        self.locationbox.setStringValue_(self.mht.fixupURL(self.mht.root))
        archive = self.mht.asWebArchive()

        # NOTE(review): the print and the dump below look like debugging
        # aids; the dump goes to a fixed, predictable path in /tmp.
        print("Archive", archive.description())
        with open("/tmp/archive.webarchive", "wb") as fp:
            fp.write(archive.data().bytes())

        frame = self.webview.mainFrame()
        frame.stopLoading()
        frame.loadArchive_(archive)
        # A leftover ``1/0`` used to follow here; it raised
        # ZeroDivisionError on every load and has been removed.

loader.py

import email, urllib

from WebKit import WebResource, WebArchive
from Cocoa import NSData, NSString, NSURL

def loadMHT(filename):
    """
    Load a .mht HTML archive and return the WebArchive representation.

    Convenience wrapper: builds an MHTLoader for *filename* and returns the
    result of its asWebArchive() method.
    """
    # The original called the non-existent name ``HMTLoad`` and therefore
    # raised NameError on every call.
    return MHTLoader(filename).asWebArchive()


class MHTLoader (object):
    """
    A loader for .mht files, an archive format used by MS Internet Explorer
    on Windows.

    After construction ``parts`` maps the Content-Location of every
    non-multipart MIME part to a ``(content-type, data)`` tuple and ``root``
    is the Content-Location of the first such part.
    """

    def __init__(self, filename):
        """Parse the archive stored in *filename*."""
        self.filename = filename

        # root of the archive (key into self.parts)
        self.root = None

        # Content-Location -> (content-type, data)
        self.parts = {}

        self.loadFile(filename)

    def loadFile(self, filename):
        """
        Read the MIME message in *filename* and fill ``parts`` and ``root``.

        Parts without a Content-Location header are skipped: they cannot be
        addressed by URL and a ``None`` key would break fixupURL later on.
        """
        # Parse from bytes: an archive may contain 8-bit data that is not
        # valid in the locale's text encoding, which made the text-mode
        # open() fail with UnicodeDecodeError.
        with open(filename, "rb") as fp:
            msg = email.message_from_binary_file(fp)

        for part in msg.walk():
            if part.get_content_maintype() == "multipart":
                continue

            location = part.get("Content-Location")
            if location is None:
                continue

            contentType = part.get_content_type()
            data = part.get_payload(decode=True)

            self.parts[location] = (contentType, data)
            if self.root is None:
                self.root = location

    def fixupURL(self, url):
        """
        Return *url* with backslashes replaced by forward slashes when it
        is a ``file:`` URL; other URLs are returned unchanged.
        """
        # IE creates MHT files with file: URLS containing backslashes,
        # NSURL insists that those are invalid, replace backslashes by
        # forward slashes.
        if url.startswith("file:"):
            return url.replace("\\", "/")
        else:
            return url

    def _makeResource(self, url, mimeType, data):
        # Wrap one archive part in a WebResource (no text encoding name,
        # no frame name).
        return WebResource.alloc().initWithData_URL_MIMEType_textEncodingName_frameName_(
            NSData.dataWithBytes_length_(data, len(data)),
            NSURL.URLWithString_(self.fixupURL(url)),
            NSString.stringWithString_(mimeType),
            None,
            None)

    def asWebArchive(self):
        """
        Convert the MHT archive to a webarchive.

        The root part becomes the main resource, every other part becomes
        a subresource. Raises KeyError when no part was loaded.
        """
        rootType, rootText = self.parts[self.root]

        # Backslashes in the root document are replaced by forward slashes,
        # matching what fixupURL does to file: URLs. This is a
        # byte-for-byte replacement, so the length does not change.
        pageResource = self._makeResource(
            self.root, rootType, rootText.replace(b"\\", b"/"))

        resources = []
        for url, (tp, data) in self.parts.items():
            if url == self.root:
                continue
            resources.append(self._makeResource(url, tp, data))

        return WebArchive.alloc().initWithMainResource_subresources_subframeArchives_(
            pageResource, resources, None)


def main():
    """
    Manual test: convert ``python-home.mht`` in the current directory to
    ``python-home.webarchive``.
    """
    # Testing...
    loader = MHTLoader("python-home.mht")
    archive = loader.asWebArchive()

    # Serialize once; the original built the archive data twice and left
    # the first result in an unused local.
    with open("python-home.webarchive", "wb") as fp:
        fp.write(archive.data().bytes())


if __name__ == "__main__":
    main()

main.py

# Switch on PyObjC's verbose mode before the other modules are imported.
import objc
objc.setVerbose(1)

# MHTDocument is not referenced by name in this script; presumably it is
# imported so the document class is defined before the event loop starts.
import MHTDocument
from PyObjCTools import AppHelper

# Hand control to the application event loop.
AppHelper.runEventLoop()

setup.py

"""
Script for building the example.

Usage:
    python3 setup.py py2app
"""
from setuptools import setup

# Document type handled by the application: ".mht" files, opened by the
# MHTDocument class.
mht_document_type = {
    "CFBundleTypeExtensions": ["mht"],
    "CFBundleTypeName": "Internet Explorer Web Archive",
    "CFBundleTypeRole": "Editor",
    "NSDocumentClass": "MHTDocument",
}

# Info.plist entries passed to py2app.
plist = {"CFBundleDocumentTypes": [mht_document_type]}

setup(
    name="MHTViewer",
    app=["main.py"],
    data_files=["MainMenu.nib", "MHTDocument.nib"],
    options={"py2app": {"plist": plist}},
    setup_requires=[
        "py2app",
        "pyobjc-framework-Cocoa",
        "pyobjc-framework-WebKit",
    ],
)

Resources