Source code for rosetta.text.filefilter
"""
Contains a collection of functions that clean, decode and move files around.
"""
from fnmatch import fnmatch
import os
import re
from ..common import lazyprop
def get_paths(
        base_path, file_type="*", relative=False, get_iter=False, limit=None):
    """
    Crawl base_path and its subdirectories and return the paths of the files
    whose name matches file_type.

    Parameters
    ----------
    base_path : String
        Path to the directory that will be crawled
    file_type : String
        Glob pattern used to filter files.  E.g. '*.txt'.  Note that the
        filenames will be converted to lowercase before this comparison, so
        the pattern itself should be lowercase.
    relative : Boolean
        If True, get paths relative to base_path
        If False, get paths prefixed with base_path as it was passed in
        (they are absolute only when base_path is absolute)
    get_iter : Boolean
        If True, return an iterator over paths rather than a list.
    limit : Integer or None
        Maximum number of paths to produce.  None means no limit.

    Returns
    -------
    paths : List of String, or an iterator over String if get_iter is True
    """
    path_iter = _get_paths_iter(
        base_path, file_type=file_type, relative=relative, limit=limit)
    if get_iter:
        return path_iter
    # Materialize the lazy crawl so callers get a reusable, indexable list.
    return list(path_iter)
def _get_paths_iter(base_path, file_type="*", relative=False, limit=None):
    """
    Generator that walks base_path (following symlinks) and yields the path
    of every file whose lowercased name matches the glob file_type.

    Parameters
    ----------
    base_path : String
        Directory to walk.
    file_type : String
        Glob pattern compared against the lowercased filename.
    relative : Boolean
        If True, yielded paths are relative to base_path.
        If False, they are prefixed with base_path as given.
    limit : Integer or None
        Stop after yielding this many paths.  None means no limit.
    """
    if limit == 0:
        return

    counter = 0
    for dirpath, _subdirs, files in os.walk(base_path, followlinks=True):
        if relative:
            # relpath strips only the leading base_path; a plain
            # str.replace would also remove later occurrences of it.
            out_dir = os.path.relpath(dirpath, base_path)
            if out_dir == os.curdir:
                # Files directly inside base_path get a bare filename.
                out_dir = ""
        else:
            out_dir = dirpath

        for name in files:
            if not fnmatch(name.lower(), file_type):
                continue
            yield os.path.join(out_dir, name)
            counter += 1
            if counter == limit:
                # Plain return ends the generator.  Raising StopIteration
                # inside a generator is turned into RuntimeError (PEP 479).
                return
def path_to_name(path, strip_ext=True):
    """
    Return the last component of path (the filename).

    Parameters
    ----------
    path : String
    strip_ext : Boolean
        If True, drop the final extension (everything from the last period
        on, as determined by os.path.splitext).
        If False, return the filename unchanged.

    Returns
    -------
    name : String
    """
    filename = os.path.basename(path)
    if not strip_ext:
        return filename
    return os.path.splitext(filename)[0]
def path_to_newname(path, name_level=1):
    """
    Takes one path and returns a new name, combining the directory structure
    with the filename.  The filename's extension is dropped.

    Parameters
    ----------
    path : String
        Path whose components are separated by '/' (os.sep is accepted too).
    name_level : Integer
        Form the name using items this far back in the path.  E.g. if
        path = mydata/1234/3.txt and name_level == 2, then name = 1234_3

    Returns
    -------
    name : String
        The last name_level path components joined with '_'.
    """
    # Note: name_level == 0 slices with [-0:], which keeps every component.
    parts = path.replace(os.sep, '/').split('/')[-name_level:]
    if parts:
        # Strip the extension from the filename only.  Running splitext on
        # the joined string would cut at a period inside a directory name
        # whenever the filename itself has no extension.
        parts[-1] = os.path.splitext(parts[-1])[0]
    return '_'.join(parts)
class PathFinder(object):
    """
    Collect the paths below a base directory and look them up by doc_id.
    """
    # NOTE(review): lazyprop is imported from ..common and is not visible
    # here; presumably it computes the value on first access and caches it.
    # Confirm against its definition.

    def __init__(
            self, text_base_path=None, file_type='*', name_strip=r'\..*',
            limit=None):
        r"""
        Parameters
        ----------
        text_base_path : String
            Directory that is crawled to find paths.
        file_type : String
            Glob expression used to filter filenames.
        name_strip : String (Regex)
            Pattern removed from each filename to obtain its doc_id.
            The default r'\..*' removes everything from the first period on.
        limit : Integer
            Limit passed on to get_paths when collecting paths.
        """
        self.limit = limit
        self.name_strip = name_strip
        self.file_type = file_type
        self.text_base_path = text_base_path

    @lazyprop
    def paths(self):
        """
        Paths found under text_base_path, or None when no base path is set.
        """
        if not self.text_base_path:
            return None
        return get_paths(
            self.text_base_path, self.file_type, limit=self.limit)

    @lazyprop
    def doc_id(self):
        """
        List of doc_ids, one per entry of self.paths and in the same order.
        """
        pattern = re.compile(self.name_strip)
        ids = []
        for one_path in self.paths:
            # Keep the extension here; name_strip decides what is removed.
            filename = path_to_name(one_path, strip_ext=False)
            ids.append(pattern.sub('', filename))
        return ids

    @lazyprop
    def _doc_id_to_path(self):
        """
        Dictionary mapping each doc_id (derived from the filename) to its
        path.  If two paths share a doc_id, the later one is kept.
        """
        return {
            key: one_path for key, one_path in zip(self.doc_id, self.paths)}

    def __getitem__(self, identifiers):
        """
        self[identifiers] returns the list of paths for the given doc_ids.

        A single string is treated as one identifier; anything else is
        iterated, and each item is converted with str() before the lookup.
        """
        if isinstance(identifiers, str):
            wanted = [identifiers]
        else:
            wanted = identifiers
        return [self._doc_id_to_path[str(key)] for key in wanted]