Source code for rosetta.text.filefilter

"""
Contains a collection of functions that clean, decode and move files around.
"""
from fnmatch import fnmatch
import os
import re

from ..common import lazyprop


def get_paths(
        base_path, file_type="*", relative=False, get_iter=False, limit=None):
    """
    Crawl base_path, subdirectories included, and collect the paths of
    the files whose names match file_type.

    Parameters
    ----------
    base_path : String
        Path to the directory that will be crawled.
    file_type : String
        Pattern used to filter files, e.g. '*.txt'.  Filenames are
        converted to lowercase before they are compared to it.
    relative : Boolean
        If True, paths are given relative to base_path.
        If False, paths start with base_path as it was passed in.
    get_iter : Boolean
        If True, return the iterator over paths itself.
        If False (the default), return the paths as a list.
    limit : Integer or None
        Maximum number of paths to produce; forwarded to
        _get_paths_iter.  None (the default) means no limit.

    Returns
    -------
    paths : Iterator or List of Strings
    """
    matches = _get_paths_iter(
        base_path, file_type=file_type, relative=relative, limit=limit)

    if not get_iter:
        # Materialize the lazy crawl for callers that want a list.
        return list(matches)
    return matches


def _get_paths_iter(base_path, file_type="*", relative=False, limit=None):
    """
    Yield paths to the files under base_path whose lowercased name
    matches file_type.

    Parameters
    ----------
    base_path : String
        Directory to crawl with os.walk; symlinked directories are
        followed.
    file_type : String
        fnmatch-style pattern.  It is compared against the lowercased
        filename, so a pattern containing uppercase letters will not
        match on case-sensitive platforms.
    relative : Boolean
        If True, yield paths relative to base_path.
        If False, yield paths that begin with base_path as given.
    limit : Integer or None
        Stop once this many paths have been yielded.  None means no
        limit; 0 yields nothing.

    Yields
    ------
    path : String
    """
    counter = 0
    for dirpath, _subdirs, files in os.walk(base_path, followlinks=True):
        if relative:
            # Strip only the leading base_path.  A plain str.replace would
            # also delete later occurrences of base_path inside the path.
            prefix = os.path.relpath(dirpath, base_path)
            if prefix == os.curdir:
                # Files directly in base_path are reported as bare names.
                prefix = ""
        else:
            prefix = dirpath

        for name in files:
            if not fnmatch(name.lower(), file_type):
                continue
            if counter == limit:
                # A plain return ends the generator.  Raising StopIteration
                # here is turned into RuntimeError by PEP 479 (Python 3.7+).
                return
            yield os.path.join(prefix, name)
            counter += 1


def path_to_name(path, strip_ext=True):
    """
    Return the filename part of one path.

    Parameters
    ----------
    path : String
    strip_ext : Boolean
        If True (the default), the last extension is removed from the
        filename.  If False, the filename is returned unchanged.

    Returns
    -------
    name : String
    """
    filename = os.path.basename(path)
    if not strip_ext:
        return filename
    return os.path.splitext(filename)[0]


def path_to_newname(path, name_level=1):
    """
    Takes one path and returns a new name, combining the directory
    structure with the filename.

    Parameters
    ----------
    path : String
        Path whose components are separated by '/'.
    name_level : Integer
        Form the name using items this far back in the path.
        E.g. if path = mydata/1234/3.txt and name_level == 2, then
        name = 1234_3

    Returns
    -------
    name : String
        The selected path components joined with '_', with the extension
        removed from the filename component.
    """
    parts = path.split('/')[-name_level:]
    if parts:
        # Strip the extension from the filename only.  Running splitext on
        # the joined string would cut at a dot that belongs to a directory
        # name whenever the filename itself has no extension.
        parts[-1] = os.path.splitext(parts[-1])[0]
    return '_'.join(parts)


class PathFinder(object):
    """
    Locate files beneath a base directory and look their paths up by
    doc_id.
    """

    # NOTE(review): lazyprop is imported from ..common and is not visible
    # here; presumably it evaluates the method on first access and caches
    # the result -- confirm against its definition.

    def __init__(
            self, text_base_path=None, file_type='*', name_strip=r'\..*',
            limit=None):
        r"""
        Parameters
        ----------
        text_base_path : String
            Base path that will be crawled to find paths.
        file_type : String
            Glob expression filtering the file type.
        name_strip : String (Regex)
            Pattern removed from each filename to turn it into a doc_id.
            The default r'\..*' strips everything from the first period
            onward.
        limit : Integer
            Limit the paths returned to this number.
        """
        self.limit = limit
        self.name_strip = name_strip
        self.file_type = file_type
        self.text_base_path = text_base_path

    @lazyprop
    def paths(self):
        """
        All paths found under text_base_path by get_paths, or None when
        text_base_path is not set.
        """
        if not self.text_base_path:
            return None
        return get_paths(
            self.text_base_path, self.file_type, limit=self.limit)

    @lazyprop
    def doc_id(self):
        """
        The doc_id for every entry of self.paths, in the same order.

        Each doc_id is the filename (extension included) with every match
        of the name_strip regex removed.
        """
        pattern = re.compile(self.name_strip)
        filenames = (path_to_name(p, strip_ext=False) for p in self.paths)
        return [pattern.sub('', fname) for fname in filenames]

    @lazyprop
    def _doc_id_to_path(self):
        """
        Dictionary mapping each doc_id to its path.  If two paths share a
        doc_id, the later path is the one kept.
        """
        return {
            doc: path for doc, path in zip(self.doc_id, self.paths)}

    def __getitem__(self, identifiers):
        """
        self[identifiers] returns the list of paths for identifiers.

        A single string is treated as one identifier; anything else is
        iterated over.  Every identifier is converted with str() before
        the lookup, and one that is not a known doc_id raises KeyError.
        """
        wanted = (
            [identifiers] if isinstance(identifiers, str) else identifiers)
        return [self._doc_id_to_path[str(key)] for key in wanted]