Source code for pyrocore.util.traits

# -*- coding: utf-8 -*-
# pylint: disable=
""" Classification.

    Copyright (c) 2010, 2011 The PyroScope Project <pyroscope.project@gmail.com>
"""
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
from __future__ import absolute_import

import re
import logging
from collections import defaultdict

from pyrocore import config
from pyrocore.util import os

# Module-level logger, named after this module
log = logging.getLogger(__name__)

# Sets of extensions / kinds
# Each set holds lower-case file extensions (without the leading dot) of one kind
KIND_AUDIO = {"flac", "mp3", "ogg", "wav", "dts", "ac3", "alac", "wma"}
KIND_VIDEO = {
    "avi", "mkv", "m4v", "vob", "mp4", "mpg", "mpeg", "m2ts", "ts", "ogv", "wmv",
}
KIND_IMAGE = {"jpg", "png", "gif", "tif", "bmp", "svg"}
KIND_DOCS = {
    "chm", "pdf", "cbr", "cbz", "odt", "ods", "doc", "xls", "ppt",
    "epub", "mobi", "azw3", "djvu",
}
KIND_ARCHIVE = {"rar", "zip", "tgz", "bz2", "iso", "bin"}

# Regex matchers for names

# Regex alternation of all video extensions, each with its leading dot and
# regex-escaped; the order of the alternatives follows the iteration order of
# the KIND_VIDEO set.
_VIDEO_EXT = '|'.join(re.escape('.' + _i) for _i in KIND_VIDEO)

# Common tail appended to every TV pattern: a fixed sequence of optional,
# separator-prefixed release tags (each in its own named group), followed by
# an optional release group and an optional video extension, anchored at the
# end of the name. Numbered groups ("release2", "format2", "codec2") repeat a
# tag slot at a second position.
_TV_TRAIL = (
    r"(?:[._ ](?P<release_tags>PREAIR|READNFO))?"
    r"(?:[._ ](?P<release>REPACK|PROPER|REAL|REALPROPER|INTERNAL))?"
    r"(?:[._ ](?P<aspect>WS))?"
    r"(?:[._ ](?P<format>HDTV|PDTV|DSR|DVD[59]?|DVDSCR|480p|576p|720p|1080p|1080i|2160p))?"
    r"(?:[._ ](?P<release2>WEB-DL|WEB\.DL|WEBRip))?"
    r"(?:[._ ](?P<format2>HDTV|PDTV|DSR|DVD[59]?|DVDSCR|480p|576p|720p|1080p|1080i|2160p))?"
    r"(?:[._ ](?P<codec>[XH]\.?264|XviD|VTS|ISO|NTSC|PAL))?"
    r"(?:[._ ](?P<sound>MP3|AC3|DD5\.1|L?PCM|AAC 2\.0))?"
    r"(?:[._ ](?P<codec2>[XH]\.?264|XviD|VTS|ISO|NTSC|PAL))?"
    r"(?:[-. ](?P<group>.+?))?(?P<extension>" + _VIDEO_EXT + ")?$"
)

# Lower-case, dot-delimited markers (".hdtv." etc.); `name_trait` classifies
# any name containing one of them as "tv" and skips the movie patterns.
_DEFINITELY_TV = [".%s." % _i.lower() for _i in ("HDTV", "PDTV", "DSR")]

# TV name patterns as a list of (description, compiled regex) pairs; the
# regexes are compiled case-insensitively and each one ends with the shared
# `_TV_TRAIL`. `name_trait` tries them in list order and checks the "show"
# group of a match against BAD_TITLE_WORDS before accepting it.
TV_PATTERNS = [(_k, re.compile(_i, re.I)) for _k, _i in (
    ("Normal TV Episodes",
        r"^(?P<show>.+?)[._ ]S?(?P<season>\d{1,2})[xE](?P<episode>\d{2}(?:-?E\d{2})?)"
        r"(?:[._ ](?P<title>.+?[a-zA-Z]{1,2}.+?))?"
        + _TV_TRAIL
    ),
    ("Normal TV Episodes (all-numeric season+episode)",
        r"^(?P<show>.+?)[._ ](?P<season>\d)(?P<episode>\d{2})"
        r"(?:[._ ](?P<title>.+?[a-zA-Z]{1,2}.+?))?"
        + _TV_TRAIL
    ),
    ("Daily Shows",
        r"^(?P<show>.+?)[._ ](?P<date>\d{4}\.\d{2}\.\d{2})"
        r"(?:[._ ](?P<title>.+?[a-zA-Z]{1,2}.+?))?"
        + _TV_TRAIL
    ),
    ("Full Seasons",
        r"^(?P<show>.+?)[._ ]S?(?P<season>\d{1,2})" + _TV_TRAIL
    ),
    ("Mini Series",
        r"^(?P<show>.+?)"
        r"(?:[._ ](?:Part(?P<part>\d+?)|Pilot)){1,2}"
        #         (?P<year>\d{4})| creates false positives for movies!
        r"(?:[._ ](?P<title>.+?[a-z]{1,2}.+?))??"
        + _TV_TRAIL
    ),
    ("Mini Series (Roman numerals)",
        r"^(?P<show>.+?)"
        r"(?:[._ ]Pa?r?t[._ ](?P<part>[ivxIVX]{1,3}?))"
        r"(?:[._ ](?P<title>.+?[a-z]{1,2}.+?))??"
        + _TV_TRAIL
    ),
)]

# Movie name patterns as a list of (description, compiled regex) pairs,
# compiled case-insensitively. Both patterns require a title, a 4-digit year
# and a "source" tag (that group is not optional); everything else is
# optional. `name_trait` tries them in list order, checks the "title" group
# of a match against BAD_TITLE_WORDS, and folds digit-suffixed groups
# ("format0", "codec1", "sound2", ...) into the un-suffixed name.
MOVIE_PATTERNS = [(_k, re.compile(_i, re.I)) for _k, _i in (
    ("Scene tagged movie",
        r"^(?P<title>.+?)[. ][[(]?(?P<year>\d{4})[)\]]?"
        r"(?:[._ ](?P<release>UNRATED|REPACK|INTERNAL|L[iI]M[iI]TED))*"
        r"(?:[._ ](?P<format>480p|576p|720p|1080p|1080i|2160p))?"
        r"(?:[._ ](?P<source>BDRip|BRRip|HDRip|DVDRip|PAL|NTSC))"
        r"(?:[._ ](?P<sound1>MP3|AC3|FLAC|DTS(?:-HD)?))?"
        r"(?:[._ ](?P<codec1>xvid|divx|avc|x264|hevc|h265))?"
        r"(?:[._ ](?P<sound2>MP3|AC3|FLAC|DTS(?:-HD)?))?"
        #r"(?:[._ ](?P<channels>6ch))?"
        r"(?:[-.](?P<group>.+?))?"
        r"(?P<extension>" + _VIDEO_EXT + ")?$"
    ),
    ("Blu-ray movie",
        r"^(?P<title>.+?)[. ][[(]?(?P<year>\d{4})[)\]]?"
        r"(?:[._ ](?P<release>UNRATED|REPACK|INTERNAL|MULTI|L[iI]M[iI]TED))*"
        r"(?:[._ ](?P<format0>720p|1080p|1080i|2160p))?"
        r"(?:[._ ](?P<source>Blu-ray|BluRay|BD25|BD50))"
        r"(?:[._ ](?P<format>720p|1080p|1080i|2160p))?"
        r"(?:[._ ](?P<codec1>avc|x264|hevc|h265))?"
        r"(?:[._ ](?P<sound>DTS(?:-HD)?))*"
        r"(?:[._ ](?P<channels>6ch|MA.5.1))?"
        r"(?:[._ ](?P<codec2>avc|x264|hevc|h265))?"
        r"(?:[-.](?P<group>.+?))?"
        r"(?P<extension>" + _VIDEO_EXT + ")?$"
    ),
)]

# Lower-case release tags that must not occur inside a matched show / movie
# title; a regex match whose title contains one of them is rejected.
BAD_TITLE_WORDS = {
    # sources
    "bdrip",
    "brrip",
    "hdrip",
    "dvdrip",
    "ntsc",
    "hdtv",
    "dvd-r",
    "dvdr",
    "dvd5",
    "dvd9",
    "web-dl",
    "blu-ray",
    "bluray",
    "bd25",
    "bd50",
    # resolutions
    "480p",
    "576p",
    "720p",
    "1080p",
    "2160p",
    # sound
    "mp3",
    "ac3",
    "dts",
}

# Remove loop variables that the list comprehensions above leak into the
# module namespace on Python 2. On Python 3 comprehension variables never
# reach module scope, so a plain `del _k, _i` would raise NameError at import
# time; drop the names only if they are actually present.
for _leaked in ("_k", "_i"):
    globals().pop(_leaked, None)
del _leaked


def get_filetypes(filelist, path=None, size=os.path.getsize):
    """ Compute the share each file type has in the total size of some files.

        Extensions are lower-cased and stripped of their dot; split-archive
        parts ("r00", "r01", ...) count as "rar", "jpeg" as "jpg" and "mpeg"
        as "mpg". Files without an extension are accounted under "".

        @param filelist: Iterable of file entries.
        @param path: Optional callable mapping an entry to its file name
            (by default, entries are taken to be the names themselves).
        @param size: Callable mapping an entry to its size.
        @return: List of weighted file extensions (no '.'), sorted in descending order
        @rtype: list of (weight, filetype)
    """
    if not path:
        def path(entry):
            return entry

    synonyms = {"jpeg": "jpg", "mpeg": "mpg"}

    # Sum up the sizes per (normalized) extension
    totals = defaultdict(int)
    for item in filelist:
        _, suffix = os.path.splitext(path(item))
        suffix = suffix.lstrip('.').lower()
        if suffix[:1] == 'r' and suffix[1:].isdigit():
            suffix = "rar"
        else:
            suffix = synonyms.get(suffix, suffix)
        totals[suffix] += size(item)

    # Turn absolute sizes into rounded integer percentages
    grand_total = sum(totals.values())
    if grand_total:
        for suffix in list(totals):
            totals[suffix] = int(totals[suffix] * 100.0 / grand_total + .499)

    weighted = [(weight, suffix) for suffix, weight in totals.items()]
    weighted.sort(reverse=True)
    return weighted
def name_trait(name, add_info=False):
    """ Determine content type from name.

        The name is checked against `TV_PATTERNS` and then `MOVIE_PATTERNS`;
        the first pattern that matches and whose title group contains none
        of the `BAD_TITLE_WORDS` decides the result. Names containing one of
        the `_DEFINITELY_TV` markers are always "tv" and are only checked
        against the TV patterns. Empty names and names starting with "VTS_"
        are never classified.

        @param name: Item name to classify (may be empty or C{None}).
        @param add_info: If true, also return the details of the match.
        @return: The kind ("tv", "movie", or C{None} if undetermined); with
            C{add_info}, a tuple of the kind and a dict holding the named
            groups of the accepted match plus the pattern's description
            under the "pattern" key (the dict is empty if no pattern was
            accepted).
    """
    kind, info = None, {}

    # Anything to check against?
    if name and not name.startswith("VTS_"):
        lower_name = name.lower()
        trait_patterns = (("tv", TV_PATTERNS, "show"), ("movie", MOVIE_PATTERNS, "title"))

        # TV check
        if any(i in lower_name for i in _DEFINITELY_TV):
            kind = "tv"
            trait_patterns = trait_patterns[:1]

        # Regex checks
        re_name = '.'.join([i.lstrip('[(').rstrip(')]') for i in name.split(' .')])
        for trait, patterns, title_group in trait_patterns:
            # Track the *accepted* match explicitly: a match that is rejected
            # because of a bad title word must not count, even if it comes
            # from the last pattern of the list.
            accepted = None
            for patname, pattern in patterns:
                matched = pattern.match(re_name)
                if not matched:
                    continue
                groups = matched.groupdict()
                title = groups[title_group].lower()
                if not any(i in title for i in BAD_TITLE_WORDS):
                    accepted = (patname, groups)
                    break

            if accepted is None:
                # Nothing usable in this category, try the next one
                continue

            kind = trait
            patname, info = accepted
            info["pattern"] = patname

            # Fold auxiliary groups into main one
            # (e.g. "format2" is merged into "format", space-separated)
            for key, val in list(info.items()):
                if key[-1].isdigit():
                    del info[key]
                    if val:
                        key = re.sub("[0-9]+$", "", key)
                        info[key] = ("%s %s" % (info.get(key) or "", val)).strip()

            break

    # TODO: Split by "dvdrip", year, etc. to get to the title and then
    # do a imdb / tvdb lookup; cache results, hits for longer, misses
    # for a day at max.

    # Return requested result
    return (kind, info) if add_info else kind
def detect_traits(name=None, alias=None, filetype=None):
    """ Build traits list from passed attributes.

        The result is a list of hierarchical classifiers, the top-level
        consisting of "audio", "movie", "tv", "video", "document", etc.
        It can be used as a part of completion paths to build directory
        structures.

        @param name: Item name, used to refine video and archive types.
        @param alias: Tracker alias, looked up in C{config.traits_by_alias}.
        @param filetype: File extension, with or without leading dots.
        @return: List of classifiers, empty if nothing could be determined.
    """
    if filetype:
        filetype = filetype.lstrip('.')

    # "Themed" trackers take precedence over any guessing
    themed = config.traits_by_alias.get(alias)
    if alias and themed:
        return [themed, filetype or "other"]

    # Guess from file extension and name; the flag tells whether the name
    # may override the generic category of that kind
    by_extension = (
        ("audio", KIND_AUDIO, False),
        ("video", KIND_VIDEO, True),
        ("img", KIND_IMAGE, False),
        ("docs", KIND_DOCS, False),
        ("misc", KIND_ARCHIVE, True),
    )
    for fallback, extensions, check_name in by_extension:
        if filetype in extensions:
            category = fallback
            if check_name:
                category = name_trait(name) or fallback
            return [category, filetype]

    return []