# Source code for bridgedb.parse.descriptors

# -*- coding: utf-8 ; test-case-name: bridgedb.test.test_parse_descriptors ; -*-
#_____________________________________________________________________________
#
# This file is part of BridgeDB, a Tor bridge distribution system.
#
# :authors: Isis Lovecruft 0xA3ADB67A2CDB8B35 <isis@torproject.org>
#           please also see AUTHORS file
# :copyright: (c) 2007-2015, The Tor Project, Inc.
#             (c) 2014-2015, Isis Lovecruft
# :license: see LICENSE for licensing information
#_____________________________________________________________________________

"""Parsers for Tor Bridge descriptors, including ``bridge-networkstatus``
documents, ``bridge-server-descriptor``s, and ``bridge-extrainfo``
descriptors.

.. py:module:: bridgedb.parse.descriptors
    :synopsis: Parsers for Tor Bridge descriptors.

bridgedb.parse.descriptors
===========================
::

 DescriptorWarning - Raised when we parse a very odd descriptor.
 deduplicate - Deduplicate a container of descriptors, keeping only the newest
               descriptor for each router.
 parseNetworkStatusFile - Parse a bridge-networkstatus document generated and
                          given to us by the BridgeAuthority.
 parseServerDescriptorsFile - Parse a file containing
                              bridge-server-descriptors.
 parseExtraInfoFiles - Parse (multiple) file(s) containing bridge-extrainfo
                       descriptors.
..
"""

from __future__ import print_function

import datetime
import logging
import os
import shutil

from stem import ProtocolError
from stem.descriptor import parse_file
from stem.descriptor.router_status_entry import _parse_file as _parseNSFile
from stem.descriptor.router_status_entry import RouterStatusEntryV3

from bridgedb import safelog
from bridgedb.parse.nickname import InvalidRouterNickname


class DescriptorWarning(Warning):
    """Raised when we parse a very odd descriptor.

    This is a plain subclass of the builtin :class:`Warning` and adds no
    behaviour of its own, so it can be filtered like any other warning
    category.
    """


def _copyUnparseableDescriptorFile(filename):
    """Keep a timestamped copy of a descriptor file which failed to parse.

    The copy is written next to the original.  For example, if **filename**
    is ``'descriptors/cached-extrainfo.new'``, the copy will be named
    something like
    ``'descriptors/2014-11-05-01:57:23_cached-extrainfo.new.unparseable'``.

    :param str filename: The path to the unparseable descriptor file that
        should be copied.
    :rtype: bool
    :returns: ``True`` if the copy was written, ``False`` if copying failed
        for any reason (the failure is logged, never raised).
    """
    # Second-resolution local time, with "-" (rather than the default "T")
    # between the date and the time.
    now = datetime.datetime.now().replace(microsecond=0)
    stamp = now.isoformat(sep="-")

    # Split off the final path component so that the timestamp can be
    # prepended to the file's name rather than to the whole path.
    directory, separator, basename = filename.rpartition(os.path.sep)
    newfilename = "%s%s%s_%s%sunparseable" % (
        directory, separator, stamp, basename, os.path.extsep)

    logging.info(
        "Unparseable descriptor file '%s' will be copied to '%s' "
        "for debugging." % (filename, newfilename))

    try:
        shutil.copyfile(filename, newfilename)
    except Exception as error:  # pragma: no cover
        # Best effort only: saving the debugging copy must never take down
        # the caller, so report the problem and carry on.
        logging.error(
            "Could not save copy of unparseable descriptor file "
            "in '%s': %s" % (newfilename, str(error)))
        return False

    logging.debug(
        "Successfully finished saving a copy of an unparseable "
        "descriptor file.")
    return True

def parseNetworkStatusFile(filename, validate=True, skipAnnotations=True,
                           descriptorClass=RouterStatusEntryV3):
    """Parse a file which contains an ``@type bridge-networkstatus`` document.

    See :trac:`12254` for why networkstatus-bridges documents don't look
    anything like the networkstatus v2 documents that they are purported to
    look like. They are missing all headers, and the entire footer (including
    authority signatures).

    :param str filename: The location of the file containing bridge
        networkstatus descriptors.
    :param bool validate: Passed along to Stem's parsers. If ``True``, the
        descriptors will raise exceptions if they do not meet some definition
        of correctness.
    :param bool skipAnnotations: If ``True``, skip parsing everything before
        the first ``r`` line.  If the file contains no ``r`` line at all, the
        whole file is skipped.
    :param descriptorClass: A class (probably from
        :mod:`stem.descriptor.router_status_entry`, i.e.
        :class:`stem.descriptor.router_status_entry.RouterStatusEntryV2` or
        :class:`stem.descriptor.router_status_entry.RouterStatusEntryV3`)
        which Stem will parse each descriptor it reads from **filename** into.
    :raises InvalidRouterNickname: if one of the routers in the networkstatus
        file had a nickname which does not conform to Tor's nickname
        specification.
    :raises ValueError: if the contents of a descriptor are malformed and
        **validate** is ``True``.
    :raises IOError: if the file at **filename** can't be read.
    :rtype: list
    :returns: A list of
        :class:`stem.descriptor.router_status_entry.RouterStatusEntry`.
    """
    routers = []

    logging.info("Parsing networkstatus file: %s" % filename)
    with open(filename) as fh:
        position = fh.tell()
        if skipAnnotations:
            # Move ``position`` forward to the start of the first ``r`` line.
            # readline() returns an empty string at EOF, so stop there too:
            # otherwise a file without any ``r`` line would loop forever.
            while True:
                line = fh.readline()
                if not line or line.startswith('r '):
                    break
                position = fh.tell()
        logging.debug("Skipping %d bytes of networkstatus file." % position)
        fh.seek(position)
        document = _parseNSFile(fh, validate, entry_class=descriptorClass)

        try:
            # Consume the parser while the file is still open.
            routers = list(document)
        except ValueError as error:
            if "nickname isn't valid" in str(error):
                raise InvalidRouterNickname(str(error))
            # Any other validation failure is passed on unchanged (still a
            # ValueError), keeping its original traceback.
            raise

    logging.info("Closed networkstatus file: %s" % filename)

    return routers

def parseServerDescriptorsFile(filename, validate=True):
    """Open and parse **filename**, which should contain
    ``@type bridge-server-descriptor``.

    .. note:: We have to lie to Stem, pretending that these are
        ``@type server-descriptor``, **not**
        ``@type bridge-server-descriptor``. See :trac:`11257`.

    :param str filename: The file to parse descriptors from.
    :param bool validate: Whether or not to validate descriptor
        contents. Passed through to Stem's ``parse_file``.
        (default: ``True``)
    :rtype: list
    :returns: A list of
        :class:`stem.descriptor.server_descriptor.RelayDescriptor`s.
    """
    logging.info("Parsing server descriptors with Stem: %s" % filename)
    # Deliberately *not* 'bridge-server-descriptor'; see the note above.
    descriptorType = 'server-descriptor 1.0'
    document = parse_file(filename, descriptorType, validate=validate)
    # Materialise everything the parser yields into a list for the caller.
    routers = list(document)
    return routers

def __cmp_published__(x, y):
    """A custom ``cmp()`` which sorts descriptors by published date.

    :param x: A descriptor with a ``published`` attribute.  Its ``nickname``
        and ``fingerprint`` attributes are also read (for logging) when both
        descriptors have the same timestamp.
    :param y: The descriptor to compare **x** against; only its
        ``published`` attribute is read.
    :rtype: int
    :returns: Return negative if x<y, zero if x==y, positive if x>y.
    """
    if x.published < y.published:
        return -1
    if x.published == y.published:
        # This *shouldn't* happen. It would mean that two descriptors for
        # the same router had the same timestamps, probably meaning there
        # is a severely-messed up OR implementation out there. Let's log
        # its fingerprint (no matter what!) so that we can look up its
        # ``platform`` line in its server-descriptor and tell whoever
        # wrote that code that they're probably (D)DOSing the Tor network.
        logging.warning(("Duplicate descriptor with identical timestamp (%s) "
                         "for bridge %s with fingerprint %s !") %
                        (x.published, x.nickname, x.fingerprint))
        return 0
    # Always hand an int back to the sort machinery, never ``None``.
    return 1


def deduplicate(descriptors, statistics=False):
    """Deduplicate some descriptors, returning only the newest for each router.

    .. note:: If two descriptors for the same router are discovered, AND both
        descriptors have the **same** published timestamp, then the router's
        fingerprint WILL BE LOGGED ON PURPOSE, because we assume that router
        to be broken or malicious.

    :param list descriptors: A list of
        :class:`stem.descriptor.server_descriptor.RelayDescriptor`,
        :class:`stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor`,
        or :class:`stem.descriptor.router_status_entry.RouterStatusEntry`.
        Each item must have ``fingerprint`` and ``published`` attributes.
    :param bool statistics: If ``True``, log some extra statistics about the
        number of duplicates: how many bridges had at least one superseded
        descriptor, how many superseded descriptors there were in total, and
        the (up to ten) bridges with the most of them.
    :rtype: dict
    :returns: A dictionary mapping router fingerprints to their newest
        available descriptor.
    """
    # Imported here so that this function is self-contained.  cmp_to_key()
    # adapts the cmp-style comparator to the ``key=`` interface, which
    # list.sort() accepts on both Python 2.7 and Python 3.
    from functools import cmp_to_key

    byPublished = cmp_to_key(__cmp_published__)
    duplicates = {}
    newest = {}

    # Group all descriptors by the router they belong to.
    for descriptor in descriptors:
        duplicates.setdefault(descriptor.fingerprint, []).append(descriptor)

    # Sort each group oldest-to-newest and take the newest off the end.  What
    # remains in ``duplicates[fingerprint]`` are the superseded descriptors.
    for fingerprint, dupes in duplicates.items():
        dupes.sort(key=byPublished)
        newest[fingerprint] = dupes.pop()

    if statistics:
        # Only count routers which actually had superseded descriptors, and
        # order them from most to fewest duplicates (ties broken by
        # fingerprint) so that the "Top N" really is the top.
        totals = sorted(
            [(len(older), fingerprint)
             for fingerprint, older in duplicates.items() if older],
            key=lambda item: (-item[0], item[1]))
        total = sum(subtotal for (subtotal, _) in totals)
        bridges = len(totals)
        top = min(10, bridges)
        logging.info("Number of bridges with duplicates: %5d" % bridges)
        logging.info("Total duplicate descriptors:       %5d" % total)
        logging.info("Bridges with the most duplicates (Top %d):" % top)
        for i, (subtotal, bridge) in enumerate(totals[:top], 1):
            logging.info("  #%d %s: %d duplicates" % (i, bridge, subtotal))

    logging.info("Descriptor deduplication finished.")

    return newest

def parseExtraInfoFiles(*filenames, **kwargs):
    """Open **filenames** and parse any ``@type bridge-extrainfo-descriptor``
    contained within.

    .. warning:: This function will *not* check that the ``router-signature``
        at the end of the extrainfo descriptor is valid. See
        ``bridgedb.bridges.Bridge._verifyExtraInfoSignature`` for a method for
        checking the signature.  The signature cannot be checked here, because
        to do so, we would need the latest, valid, corresponding
        ``signing-key`` for the Bridge.

    .. note:: This function will call :func:`deduplicate` to deduplicate the
        extrainfo descriptors parsed from all **filenames**.

    .. note:: If Stem raises a ``ValueError`` or ``ProtocolError`` while a
        file is being parsed, the error is logged, a copy of that file is
        saved for debugging, and parsing moves on to the next file.  Any
        descriptors already read from the bad file are kept.

    :param filenames: Paths of the files to parse.
    :kwargs validate: If there is a ``'validate'`` keyword argument, its value
        will be passed along as the ``'validate'`` argument to
        :class:`stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor`.
        The ``'validate'`` keyword argument defaults to ``True``, meaning that
        the hash digest stored in the ``router-digest`` line will be checked
        against the actual contents of the descriptor and the extrainfo
        document's signature will be verified.  Only an explicit ``False``
        disables validation.
    :rtype: dict
    :returns: A dictionary mapping bridge fingerprints to their corresponding,
        deduplicated
        :class:`stem.descriptor.extrainfo_descriptor.RelayExtraInfoDescriptor`.
    """
    descriptors = []

    # The ``stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor``
    # class (with ``descriptorType = 'bridge-extra-info 1.1'``) is unsuitable
    # for our purposes for the following reasons:
    #
    #   1. It expects a ``router-digest`` line, which is only present in
    #      sanitised bridge extrainfo descriptors.
    #
    #   2. It doesn't check the ``router-signature`` (nor does it expect there
    #      to be a signature).
    descriptorType = 'extra-info 1.0'

    # Validation stays on unless the caller passed exactly ``validate=False``;
    # a missing key, or any other value, leaves it enabled.
    validate = kwargs.get('validate') is not False

    for filename in filenames:
        logging.info("Parsing %s descriptors in %s..."
                     % (descriptorType, filename))

        document = parse_file(filename, descriptorType, validate=validate)

        try:
            # Append one at a time so that descriptors read before a parse
            # error are still kept.
            for router in document:
                descriptors.append(router)
        except (ValueError, ProtocolError) as error:
            logging.error(
                ("Stem exception while parsing extrainfo descriptor from "
                 "file '%s':\n%s") % (filename, str(error)))
            _copyUnparseableDescriptorFile(filename)

    routers = deduplicate(descriptors)
    return routers
# End of module.  (The text which followed here -- repeated "Table Of
# Contents", "Quick search" and "Source code for bridgedb.parse.descriptors"
# lines -- was navigation text from the HTML documentation page, not source.)