# -*- coding: utf-8 ; test-case-name: bridgedb.test.test_parse_descriptors ; -*-
#_____________________________________________________________________________
#
# This file is part of BridgeDB, a Tor bridge distribution system.
#
# :authors: Isis Lovecruft 0xA3ADB67A2CDB8B35 <isis@torproject.org>
# please also see AUTHORS file
# :copyright: (c) 2007-2015, The Tor Project, Inc.
# (c) 2014-2015, Isis Lovecruft
# :license: see LICENSE for licensing information
#_____________________________________________________________________________
"""Parsers for Tor Bridge descriptors, including ``bridge-networkstatus``
documents, ``bridge-server-descriptor``s, and ``bridge-extrainfo``
descriptors.
.. py:module:: bridgedb.parse.descriptors
:synopsis: Parsers for Tor Bridge descriptors.
bridgedb.parse.descriptors
===========================
::
DescriptorWarning - Raised when we parse a very odd descriptor.
deduplicate - Deduplicate a container of descriptors, keeping only the newest
descriptor for each router.
parseNetworkStatusFile - Parse a bridge-networkstatus document generated and
given to us by the BridgeAuthority.
parseServerDescriptorsFile - Parse a file containing
bridge-server-descriptors.
parseExtraInfoFiles - Parse (multiple) file(s) containing bridge-extrainfo
descriptors.
..
"""
from __future__ import print_function
import datetime
import logging
import os
import shutil
from stem import ProtocolError
from stem.descriptor import parse_file
from stem.descriptor.router_status_entry import _parse_file as _parseNSFile
from stem.descriptor.router_status_entry import RouterStatusEntryV3
from bridgedb import safelog
from bridgedb.parse.nickname import InvalidRouterNickname
[docs]class DescriptorWarning(Warning):
"""Raised when we parse a very odd descriptor."""
def _copyUnparseableDescriptorFile(filename):
"""Save a copy of the bad descriptor file for later debugging.
If the old filename was ``'descriptors/cached-extrainfo.new'``, then the
name of the copy will be something like
``'descriptors/2014-11-05-01:57:23_cached-extrainfo.new.unparseable'``.
:param str filename: The path to the unparseable descriptor file that we
should save a copy of.
:rtype: bool
:returns: ``True`` if a copy of the file was saved successfully, and
``False`` otherwise.
"""
timestamp = datetime.datetime.now()
timestamp = timestamp.isoformat(sep=chr(0x2d))
timestamp = timestamp.rsplit('.', 1)[0]
path, sep, fname = filename.rpartition(os.path.sep)
newfilename = "%s%s%s_%s%sunparseable" % (path, sep, timestamp,
fname, os.path.extsep)
logging.info(("Unparseable descriptor file '%s' will be copied to '%s' "
"for debugging.") % (filename, newfilename))
try:
shutil.copyfile(filename, newfilename)
except Exception as error: # pragma: no cover
logging.error(("Could not save copy of unparseable descriptor file "
"in '%s': %s") % (newfilename, str(error)))
return False
else:
logging.debug(("Successfully finished saving a copy of an unparseable "
"descriptor file."))
return True
[docs]def parseNetworkStatusFile(filename, validate=True, skipAnnotations=True,
descriptorClass=RouterStatusEntryV3):
"""Parse a file which contains an ``@type bridge-networkstatus`` document.
See :trac:`12254` for why networkstatus-bridges documents don't look
anything like the networkstatus v2 documents that they are purported to
look like. They are missing all headers, and the entire footer (including
authority signatures).
:param str filename: The location of the file containing bridge
networkstatus descriptors.
:param bool validate: Passed along to Stem's parsers. If ``True``, the
descriptors will raise exceptions if they do not meet some definition
of correctness.
:param bool skipAnnotations: If ``True``, skip parsing everything before the
first ``r`` line.
:param descriptorClass: A class (probably from
:mod:`stem.descriptors.router_status_entry`, i.e.
:class:`stem.descriptor.router_status_entry.RouterStatusEntryV2` or
:class:`stem.descriptor.router_status_entry.RouterStatusEntryV3`)
which Stem will parse each descriptor it reads from **filename** into.
:raises InvalidRouterNickname: if one of the routers in the networkstatus
file had a nickname which does not conform to Tor's nickname
specification.
:raises ValueError: if the contents of a descriptor are malformed and
**validate** is ``True``.
:raises IOError: if the file at **filename** can't be read.
:rtype: list
:returns: A list of
:class:`stem.descriptor.router_status_entry.RouterStatusEntry`.
"""
routers = []
logging.info("Parsing networkstatus file: %s" % filename)
with open(filename) as fh:
position = fh.tell()
if skipAnnotations:
while not fh.readline().startswith('r '):
position = fh.tell()
logging.debug("Skipping %d bytes of networkstatus file." % position)
fh.seek(position)
document = _parseNSFile(fh, validate, entry_class=descriptorClass)
try:
routers.extend(list(document))
except ValueError as error:
if "nickname isn't valid" in str(error):
raise InvalidRouterNickname(str(error))
else:
raise ValueError(str(error))
logging.info("Closed networkstatus file: %s" % filename)
return routers
[docs]def parseServerDescriptorsFile(filename, validate=True):
"""Open and parse **filename**, which should contain
``@type bridge-server-descriptor``.
.. note:: We have to lie to Stem, pretending that these are
``@type server-descriptor``, **not**
``@type bridge-server-descriptor``. See :trac:`11257`.
:param str filename: The file to parse descriptors from.
:param bool validate: Whether or not to validate descriptor
contents. (default: ``True``)
:rtype: list
:returns: A list of
:class:`stem.descriptor.server_descriptor.RelayDescriptor`s.
"""
logging.info("Parsing server descriptors with Stem: %s" % filename)
descriptorType = 'server-descriptor 1.0'
document = parse_file(filename, descriptorType, validate=validate)
routers = list(document)
return routers
def __cmp_published__(x, y):
"""A custom ``cmp()`` which sorts descriptors by published date.
:rtype: int
:returns: Return negative if x<y, zero if x==y, positive if x>y.
"""
if x.published < y.published:
return -1
elif x.published == y.published:
# This *shouldn't* happen. It would mean that two descriptors for
# the same router had the same timestamps, probably meaning there
# is a severely-messed up OR implementation out there. Let's log
# its fingerprint (no matter what!) so that we can look up its
# ``platform`` line in its server-descriptor and tell whoever
# wrote that code that they're probably (D)DOSing the Tor network.
logging.warn(("Duplicate descriptor with identical timestamp (%s) "
"for bridge %s with fingerprint %s !") %
(x.published, x.nickname, x.fingerprint))
return 0
elif x.published > y.published:
return 1
[docs]def deduplicate(descriptors, statistics=False):
"""Deduplicate some descriptors, returning only the newest for each router.
.. note:: If two descriptors for the same router are discovered, AND both
descriptors have the **same** published timestamp, then the router's
fingerprint WILL BE LOGGED ON PURPOSE, because we assume that router
to be broken or malicious.
:param list descriptors: A list of
:class:`stem.descriptor.server_descriptor.RelayDescriptor`,
:class:`stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor`,
or :class:`stem.descriptor.router_status_entry.RouterStatusEntry`.
:param bool statistics: If ``True``, log some extra statistics about the
number of duplicates.
:rtype: dict
:returns: A dictionary mapping router fingerprints to their newest
available descriptor.
"""
duplicates = {}
newest = {}
for descriptor in descriptors:
fingerprint = descriptor.fingerprint
if fingerprint in duplicates:
duplicates[fingerprint].append(descriptor)
else:
duplicates[fingerprint] = [descriptor,]
for fingerprint, dupes in duplicates.items():
dupes.sort(cmp=__cmp_published__)
first = dupes.pop()
newest[fingerprint] = first
duplicates[fingerprint] = dupes
if statistics:
# sorted() won't sort by values (or anything that isn't the first item
# in its container), period, no matter what the cmp function is.
totals = sorted([(len(v), k,) for k, v in duplicates.viewitems()])
total = sum([k for (k, v) in totals])
bridges = len(duplicates)
top = 10 if bridges >= 10 else bridges
logging.info("Number of bridges with duplicates: %5d" % bridges)
logging.info("Total duplicate descriptors: %5d" % total)
logging.info("Bridges with the most duplicates (Top %d):" % top)
for i, (subtotal, bridge) in zip(range(1, top + 1), totals[:top]):
logging.info(" #%d %s: %d duplicates" % (i, bridge, subtotal))
logging.info("Descriptor deduplication finished.")
return newest