Source code for nhlscrapi.scrapr.reportloader

import re

from lxml.html import fromstring

from nhlscrapi.scrapr.nhlreq import NHLCn
from nhlscrapi.scrapr import teamnameparser as TP

# enforce one method interface that fully parses doc
from abc import ABCMeta, abstractmethod

class ReportLoader(object):
    """
    Base class for objects that load full reports. Manages the html request
    and extracts the match up from the report banner.

    :param game_key: unique game identifier; it is handed unchanged to the
        :py:class:`.NHLCn` method named by ``report_type``
    :param report_type: str, type of report being loaded. Must be the name
        of a method of :py:class:`.NHLCn`
    """

    # NOTE: this is the Python 2 spelling. Python 3 ignores ``__metaclass__``,
    # so ``@abstractmethod`` on ``parse`` is only enforced under Python 2.
    __metaclass__ = ABCMeta

    # Cached lxml document; populated lazily by html_doc().
    __lx_doc = None

    def __init__(self, game_key, report_type=''):
        self.game_key = game_key
        """Game key of the report being retrieved"""

        self.report_type = report_type
        """Type of report to be loaded. Valid types correspond to the methods of :py:class:`.NHLCn`"""

        self.matchup = {}
        """
        Game meta information displayed in report banners including team names,
        final score, game date, location, and attendance. Data format is

        .. code:: python

            {
                'home': home,
                'away': away,
                'final': final,
                'attendance': att,
                'date': date,
                'location': loc
            }
        """

        self.req_err = None
        """Error from http request"""

    def html_doc(self):
        """
        Request the report (first call only) and return the parsed document.

        The parsed document is cached, so the request is made again only if
        no document has been stored yet. When the request reports an error it
        is copied to ``self.req_err`` and nothing is cached.

        :returns: the lxml processed html document, or ``None`` if the
            request failed
        :rtype: ``lxml.html.fromstring`` output
        :raises ValueError: if ``report_type`` is not an attribute of
            :py:class:`.NHLCn`
        """
        if self.__lx_doc is None:
            cn = NHLCn()

            if not hasattr(cn, self.report_type):
                raise ValueError('Invalid report type: %s' % self.report_type)
            html = getattr(cn, self.report_type)(self.game_key)

            if cn.req_err is None:
                self.__lx_doc = fromstring(html)
            else:
                self.req_err = cn.req_err

        return self.__lx_doc

    def parse_matchup(self):
        """
        Parse the banner matchup meta info for the game.

        The banner is parsed only while ``self.matchup`` is still empty; an
        already populated matchup is left untouched.

        :returns: ``self`` on success or ``None``
        """
        lx_doc = self.html_doc()

        if self.matchup:
            return self

        # html_doc() yields None when the request failed; nothing to parse.
        if lx_doc is None:
            return None

        try:
            self.matchup = self._fill_meta(lx_doc)
        except Exception:
            # Any parsing problem means "no matchup". Unlike a bare
            # ``except:``, this lets SystemExit / KeyboardInterrupt through.
            return None

        return self

    @abstractmethod
    def parse(self):
        """
        Fully parses html document.

        :returns: ``self`` on success, ``None`` otherwise
        """
        return self.parse_matchup()

    @staticmethod
    def _parse_game_info(game_info):
        """
        Pull attendance, date and location out of the banner text.

        :param game_info: str, the ``GameInfo`` text fragments joined by ``'; '``
        :returns: tuple ``(attendance, date, location)``; the defaults are
            ``0``, ``''`` and ``''`` when a field is not found
        """
        att_hits = re.findall(r'(?<=[aA]ttendance\s)(\d*\,?\d*)', game_info)
        # The pattern can match an empty string (label present, no number),
        # so guard before calling int().
        digits = att_hits[0].replace(',', '') if att_hits else ''
        att = int(digits) if digits else 0

        date_hits = re.findall(r'\w+\,?\s\w+\s\d+\,?\s\d+', game_info)
        date = date_hits[0] if date_hits else ''

        loc_hits = re.findall(r'(?<=at\W)([^\;]*)', game_info)
        loc = loc_hits[0] if loc_hits else ''

        return att, date, loc

    def _fill_meta(self, doc):
        """
        Build the matchup dictionary from the report banner tables.

        :param doc: lxml document as returned by :py:meth:`html_doc`
        :returns: dict with keys ``home``, ``away``, ``final``,
            ``attendance``, ``date`` and ``location``
        """
        def team_scr(doc, t):
            # Text nodes of the table whose id is ``t``, minus the label
            # itself, line-break filler and anything mentioning "game".
            table = doc.xpath('//table[@id="%s"]' % t)[0]
            label = t.lower()
            return [
                s for s in table.xpath('.//text()')
                if s.lower() != label
                and '\r\n' not in s
                and 'game' not in s.lower()
            ]

        # Exactly two text nodes are expected per team: the first is kept
        # under 'final', the second is the raw team name.
        away_final, away_raw = team_scr(doc, 'Visitor')
        home_final, home_raw = team_scr(doc, 'Home')
        final = {'away': away_final, 'home': home_final}

        # clean team names
        away = TP.team_name_parser(away_raw)
        home = TP.team_name_parser(home_raw)

        info_nodes = doc.xpath('//table[@id="GameInfo"]')[0].xpath('.//text()')
        game_info = '; '.join(s.strip() for s in info_nodes if s.strip() != '')

        att, date, loc = self._parse_game_info(game_info)

        return {
            'home': home,
            'away': away,
            'final': final,
            'attendance': att,
            'date': date,
            'location': loc
        }