# Source code for pyrocore.util.traits

# -*- coding: utf-8 -*-
# pylint: disable=
""" Classification.

    Copyright (c) 2010, 2011 The PyroScope Project <pyroscope.project@gmail.com>
"""
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
from __future__ import absolute_import

import re
import logging
from collections import defaultdict

from pyrocore import config
from pyrocore.util import os

log = logging.getLogger(__name__)

# File extensions (lower-case, without the leading dot), grouped by content kind
KIND_AUDIO = {"flac", "mp3", "ogg", "wav", "dts", "ac3", "alac", "wma"}
KIND_VIDEO = {
    "avi", "mkv", "m4v", "vob", "mp4", "mpg", "mpeg", "m2ts", "ts", "ogv", "wmv",
}
KIND_IMAGE = {"jpg", "png", "gif", "tif", "bmp", "svg"}
KIND_DOCS = {
    "chm", "pdf", "cbr", "cbz", "odt", "ods", "doc", "xls", "ppt",
    "epub", "mobi", "azw3", "djvu",
}
KIND_ARCHIVE = {"rar", "zip", "tgz", "bz2", "iso", "bin"}

# Regex matchers for names

# Alternation of all known video extensions, each with its leading dot
# and regex-escaped (e.g. ``\.mkv``); the order follows set iteration.
_VIDEO_EXT = '|'.join(re.escape('.' + _i) for _i in KIND_VIDEO)

# Tail shared by every entry of TV_PATTERNS: a fixed sequence of release
# tags, each of them optional, followed by an optional release group and an
# optional video extension. The closing ``$`` anchors the whole pattern to
# the end of the name. Several tags occur twice (``format``/``format2``,
# ``codec``/``codec2``, ``release``/``release2``) so that they are
# recognized at either position; ``name_trait`` later folds the numbered
# groups back into the unnumbered one.
_TV_TRAIL = (
    r"(?:[._ ](?P<release_tags>PREAIR|READNFO))?"
    r"(?:[._ ](?P<release>REPACK|PROPER|REAL|REALPROPER|INTERNAL))?"
    r"(?:[._ ](?P<aspect>WS))?"
    r"(?:[._ ](?P<format>HDTV|PDTV|DSR|DVD[59]?|DVDSCR|480p|576p|720p|1080p|1080i|2160p))?"
    r"(?:[._ ](?P<release2>WEB-DL|WEB\.DL|WEBRip))?"
    r"(?:[._ ](?P<format2>HDTV|PDTV|DSR|DVD[59]?|DVDSCR|480p|576p|720p|1080p|1080i|2160p))?"
    r"(?:[._ ](?P<codec>[XH]\.?264|XviD|VTS|ISO|NTSC|PAL))?"
    r"(?:[._ ](?P<sound>MP3|AC3|DD5\.1|L?PCM|AAC 2\.0))?"
    r"(?:[._ ](?P<codec2>[XH]\.?264|XviD|VTS|ISO|NTSC|PAL))?"
    r"(?:[-. ](?P<group>.+?))?(?P<extension>" + _VIDEO_EXT + ")?$"
)

# Lower-cased, dot-delimited tags (".hdtv.", ".pdtv.", ".dsr."); when one of
# them is a substring of the lower-cased name, ``name_trait`` classifies the
# name as "tv" without consulting the movie patterns.
_DEFINITELY_TV = [".%s." % _i.lower() for _i in ("HDTV", "PDTV", "DSR")]

# List of ``(label, compiled regex)`` pairs describing TV release names.
# All patterns are compiled case-insensitively (``re.I``), start with a
# non-greedy ``show`` group and end with ``_TV_TRAIL``. ``name_trait`` tries
# them in the order given here, so more specific patterns come first.
TV_PATTERNS = [(_k, re.compile(_i, re.I)) for _k, _i in (
    # Season + episode markers like "S01E02", "1x02" or "S01E02-E03",
    # optionally followed by an episode title
    ("Normal TV Episodes",
        r"^(?P<show>.+?)[._ ]S?(?P<season>\d{1,2})[xE](?P<episode>\d{2}(?:-?E\d{2})?)"
        r"(?:[._ ](?P<title>.+?[a-zA-Z]{1,2}.+?))?"
        + _TV_TRAIL
    ),
    # Three consecutive digits: one for the season, two for the episode
    ("Normal TV Episodes (all-numeric season+episode)",
        r"^(?P<show>.+?)[._ ](?P<season>\d)(?P<episode>\d{2})"
        r"(?:[._ ](?P<title>.+?[a-zA-Z]{1,2}.+?))?"
        + _TV_TRAIL
    ),
    # Episodes identified by a dot-separated date (4, 2 and 2 digits)
    ("Daily Shows",
        r"^(?P<show>.+?)[._ ](?P<date>\d{4}\.\d{2}\.\d{2})"
        r"(?:[._ ](?P<title>.+?[a-zA-Z]{1,2}.+?))?"
        + _TV_TRAIL
    ),
    # A season number only (no episode, no title), then the trailing tags
    ("Full Seasons",
        r"^(?P<show>.+?)[._ ]S?(?P<season>\d{1,2})" + _TV_TRAIL
    ),
    # One or two "Part<digits>" / "Pilot" tokens after the show name;
    # the title here is matched as reluctantly as possible (``??``)
    ("Mini Series",
        r"^(?P<show>.+?)"
        r"(?:[._ ](?:Part(?P<part>\d+?)|Pilot)){1,2}"
        #         (?P<year>\d{4})| creates false positives for movies!
        r"(?:[._ ](?P<title>.+?[a-z]{1,2}.+?))??"
        + _TV_TRAIL
    ),
    # "Part" (or an abbreviation of it, ``Pa?r?t``), a separator, and a part
    # number written with one to three of the letters I, V and X
    ("Mini Series (Roman numerals)",
        r"^(?P<show>.+?)"
        r"(?:[._ ]Pa?r?t[._ ](?P<part>[ivxIVX]{1,3}?))"
        r"(?:[._ ](?P<title>.+?[a-z]{1,2}.+?))??"
        + _TV_TRAIL
    ),
)]

# List of ``(label, compiled regex)`` pairs describing movie release names,
# compiled case-insensitively (``re.I``). Both patterns require a title, a
# four-digit year (optionally wrapped in brackets or parentheses) and a
# ``source`` tag; they differ in which source tags they accept. All other
# tags are optional. Numbered groups (``sound1``, ``codec2``, ``format0``...)
# are folded into their unnumbered name by ``name_trait``.
MOVIE_PATTERNS = [(_k, re.compile(_i, re.I)) for _k, _i in (
    # Rips and plain PAL/NTSC sources
    ("Scene tagged movie",
        r"^(?P<title>.+?)[. ][[(]?(?P<year>\d{4})[)\]]?"
        r"(?:[._ ](?P<release>UNRATED|REPACK|INTERNAL|L[iI]M[iI]TED))*"
        r"(?:[._ ](?P<format>480p|576p|720p|1080p|1080i|2160p))?"
        r"(?:[._ ](?P<source>BDRip|BRRip|HDRip|DVDRip|PAL|NTSC))"
        r"(?:[._ ](?P<sound1>MP3|AC3|FLAC|DTS(?:-HD)?))?"
        r"(?:[._ ](?P<codec1>xvid|divx|avc|x264|hevc|h265))?"
        r"(?:[._ ](?P<sound2>MP3|AC3|FLAC|DTS(?:-HD)?))?"
        #r"(?:[._ ](?P<channels>6ch))?"
        r"(?:[-.](?P<group>.+?))?"
        r"(?P<extension>" + _VIDEO_EXT + ")?$"
    ),
    # Blu-ray sources; the resolution may appear before (``format0``) or
    # after (``format``) the source tag
    ("Blu-ray movie",
        r"^(?P<title>.+?)[. ][[(]?(?P<year>\d{4})[)\]]?"
        r"(?:[._ ](?P<release>UNRATED|REPACK|INTERNAL|MULTI|L[iI]M[iI]TED))*"
        r"(?:[._ ](?P<format0>720p|1080p|1080i|2160p))?"
        r"(?:[._ ](?P<source>Blu-ray|BluRay|BD25|BD50))"
        r"(?:[._ ](?P<format>720p|1080p|1080i|2160p))?"
        r"(?:[._ ](?P<codec1>avc|x264|hevc|h265))?"
        r"(?:[._ ](?P<sound>DTS(?:-HD)?))*"
        # NOTE(review): the dots in ``MA.5.1`` are unescaped and thus match
        # any character -- confirm whether that leniency is intended
        r"(?:[._ ](?P<channels>6ch|MA.5.1))?"
        r"(?:[._ ](?P<codec2>avc|x264|hevc|h265))?"
        r"(?:[-.](?P<group>.+?))?"
        r"(?P<extension>" + _VIDEO_EXT + ")?$"
    ),
)]

# Lower-case release tags that must not appear inside a matched title or
# show name; ``name_trait`` rejects any regex match whose title group
# contains one of these as a substring.
BAD_TITLE_WORDS = set((
    "bdrip", "brrip", "hdrip", "dvdrip", "ntsc",
    "hdtv", "dvd-r", "dvdr", "dvd5", "dvd9", "web-dl",
    "blu-ray", "bluray", "bd25", "bd50",
    "480p", "576p", "720p", "1080p", "2160p",
    "mp3", "ac3", "dts",
))

# Remove the helper loop variables from the module namespace. List
# comprehensions only leak them on Python 2; on Python 3 the names never
# exist, so a plain ``del _k, _i`` would raise NameError at import time.
for _leaked in ("_k", "_i"):
    globals().pop(_leaked, None)
del _leaked


def get_filetypes(filelist, path=None, size=os.path.getsize):
    """ Get a sorted list of file types and their weight in percent
        from an iterable of file names.

        Extensions are lower-cased; numbered RAR volumes (``.r00``, ``.r01``,
        ...) count as "rar", "jpeg" as "jpg", and "mpeg" as "mpg". Entries
        without an extension are collected under the empty string.

        @param filelist: Iterable of entries describing files.
        @param path: Optional callable mapping an entry to its path / name;
            entries are used as-is when omitted.
        @param size: Callable returning the size of an entry; note that it
            is passed the entry itself, not the result of C{path}.
        @return: List of weighted file extensions (no '.'), sorted in descending order
        @rtype: list of (weight, filetype)
    """
    path = path or (lambda _: _)
    aliases = {"jpeg": "jpg", "mpeg": "mpg"}

    # Get total size for each file extension
    histo = defaultdict(int)
    for entry in filelist:
        ext = os.path.splitext(path(entry))[1].lstrip('.').lower()
        if ext and ext[0] == 'r' and ext[1:].isdigit():
            ext = "rar"
        else:
            ext = aliases.get(ext, ext)
        histo[ext] += size(entry)

    # Normalize values to integer percent; when the total is zero, the raw
    # values are reported unchanged to avoid a division by zero
    total = sum(histo.values())
    if total:
        weighted = [(int(val * 100.0 / total + .499), ext) for ext, val in histo.items()]
    else:
        weighted = [(val, ext) for ext, val in histo.items()]

    return sorted(weighted, reverse=True)


def name_trait(name, add_info=False):
    """ Determine content type from name.

        The name is checked against C{TV_PATTERNS} and then against
        C{MOVIE_PATTERNS}; the first pattern that matches and whose title
        group is free of C{BAD_TITLE_WORDS} decides the result. Names
        containing one of the C{_DEFINITELY_TV} tags are always "tv" and
        are never tried against the movie patterns.

        @param name: Release or file name; empty names and names starting
            with "VTS_" are never classified.
        @param add_info: If true, also return the named groups of the
            accepted pattern (plus its label under the "pattern" key).
        @return: The kind ("tv", "movie", or C{None}), or a C{(kind, info)}
            tuple if C{add_info} is set; C{info} is empty when no pattern
            was accepted.
    """
    kind, info = None, {}

    # Anything to check against?
    if name and not name.startswith("VTS_"):
        lower_name = name.lower()
        trait_patterns = (("tv", TV_PATTERNS, "show"), ("movie", MOVIE_PATTERNS, "title"))

        # TV check
        if any(i in lower_name for i in _DEFINITELY_TV):
            kind = "tv"
            trait_patterns = trait_patterns[:1]

        # Regex checks
        # NOTE(review): this splits on the literal two-character string " ."
        # and not on "space or dot" -- confirm that this is the intention
        re_name = '.'.join([i.lstrip('[(').rstrip(')]') for i in name.split(' .')])
        for trait, patterns, title_group in trait_patterns:
            # Track acceptance explicitly: a pattern that matched but got
            # rejected because of a bad title word must neither leave
            # a stray "pattern" entry in ``info``, nor keep the remaining
            # trait patterns from being tried
            accepted = False

            for patname, pattern in patterns:
                matched = pattern.match(re_name)
                if not matched:
                    continue

                groups = matched.groupdict()
                title = groups[title_group].lower()
                if any(word in title for word in BAD_TITLE_WORDS):
                    continue

                kind, info = trait, groups
                info["pattern"] = patname
                accepted = True
                break

            if accepted:
                # Fold auxiliary groups into main one
                # (e.g. "codec2" is merged into "codec")
                for key, val in list(info.items()):
                    if key[-1].isdigit():
                        del info[key]
                        if val:
                            key = re.sub("[0-9]+$", "", key)
                            info[key] = ("%s %s" % (info.get(key) or "", val)).strip()
                break

        # TODO: Split by "dvdrip", year, etc. to get to the title and then
        # do a imdb / tvdb lookup; cache results, hits for longer, misses
        # for a day at max.

    # Return requested result
    return (kind, info) if add_info else kind


def detect_traits(name=None, alias=None, filetype=None):
    """ Build traits list from passed attributes.

        The result is a list of hierarchical classifiers: the top-level is
        either the theme configured for the given alias, or one of
        "audio", "video", "tv", "movie", "img", "docs" and "misc";
        the second level is the file type.
        It can be used as a part of completion paths to build directory
        structures.

        @param name: Name that is passed to C{name_trait}, to tell apart
            "tv" and "movie" content for video and archive file types.
        @param alias: Tracker alias, looked up in C{config.traits_by_alias}.
        @param filetype: File extension, with or without leading dots.
        @return: List of classifiers, empty if nothing could be determined.
    """
    result = []
    if filetype:
        filetype = filetype.lstrip('.')

    # Check for "themed" trackers
    theme = config.traits_by_alias.get(alias)
    if alias and theme:
        result = [theme, filetype or "other"]

    # Guess from file extension and name
    elif filetype in KIND_AUDIO:
        result = ["audio", filetype]
    elif filetype in KIND_VIDEO:
        # Generic "video", unless the name reveals a more specific kind
        result = ["video", filetype]

        contents = name_trait(name)
        if contents:
            result = [contents, filetype]
    elif filetype in KIND_IMAGE:
        result = ["img", filetype]
    elif filetype in KIND_DOCS:
        result = ["docs", filetype]
    elif filetype in KIND_ARCHIVE:
        # Archives are "misc", unless the name looks like a TV or movie release
        result = ["misc", filetype]

        contents = name_trait(name)
        if contents:
            result = [contents, filetype]

    return result