from tempfile import mkdtemp
from pathlib import Path
import requests
from ocrd.constants import TMP_PREFIX
from ocrd_utils import (
getLogger,
is_local_filename,
get_local_filename,
remove_non_path_from_url,
nth_url_segment
)
from ocrd.workspace import Workspace
from ocrd_models import OcrdMets
from ocrd_models.constants import NAMESPACES as NS
from ocrd_models.utils import handle_oai_response
[docs]class Resolver():
"""
Handle uploads, downloads, repository access, and manage temporary directories
"""
[docs] def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None):
"""
Download a file to a directory.
Early Shortcut: If `url` is a local file and that file is already in the directory, keep it there.
If `basename` is not given but subdir is, assume user knows what she's doing and
use last URL segment as the basename.
If `basename` is not given and no subdir is given, use the alnum characters in the URL as the basename.
Args:
directory (string): Directory to download files to
basename (string, None): basename part of the filename on disk.
url (string): URL to download from
if_exists (string, "skip"): What to do if target file already exists. \
One of ``skip`` (default), ``overwrite`` or ``raise``
subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp``.
Returns:
Local filename string, *relative* to directory
"""
log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)
if not url:
raise Exception("'url' must be a string")
if not directory:
raise Exception("'directory' must be a string") # actually Path would also work
directory = Path(directory)
directory.mkdir(parents=True, exist_ok=True)
directory = str(directory.resolve())
subdir_path = Path(subdir if subdir else '')
basename_path = Path(basename if basename else nth_url_segment(url))
ret = str(Path(subdir_path, basename_path))
dst_path = Path(directory, ret)
# log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url)
# print('url=%s', url)
# print('directory=%s', directory)
# print('subdir_path=%s', subdir_path)
# print('basename_path=%s', basename_path)
# print('ret=%s', ret)
# print('dst_path=%s', dst_path)
src_path = None
if is_local_filename(url):
try:
# XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+
src_path = Path(get_local_filename(url)).resolve()
except FileNotFoundError as e:
log.error("Failed to resolve URL locally: %s --> '%s' which does not exist" % (url, src_path))
raise e
if not src_path.exists():
raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url)
if src_path == dst_path:
log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url))
return ret
# Respect 'if_exists' arg
if dst_path.exists():
if if_exists == 'skip':
return ret
if if_exists == 'raise':
raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path))
# Create dst_path parent dir
dst_path.parent.mkdir(parents=True, exist_ok=True)
# Copy files or download remote assets
if src_path:
log.debug("Copying file '%s' to '%s'", src_path, dst_path)
dst_path.write_bytes(src_path.read_bytes())
else:
log.debug("Downloading URL '%s' to '%s'", url, dst_path)
response = requests.get(url)
if response.status_code != 200:
raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code))
contents = handle_oai_response(response)
dst_path.write_bytes(contents)
return ret
[docs] def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None):
"""
Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).
Arguments:
mets_url (string): Source METS URL or filesystem path
Keyword Arguments:
dst_dir (string, None): Target directory for the workspace. \
By default create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \
(The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
clobber_mets (boolean, False): Whether to overwrite existing ``mets.xml``. \
By default existing ``mets.xml`` will raise an exception.
download (boolean, False): Whether to also download all the files referenced by the METS
src_baseurl (string, None): Base URL for resolving relative file locations
Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
the former is already local and the latter is ``none`` or already identical to its directory name.
Returns:
a new :py:class:`~ocrd.workspace.Workspace`
"""
log = getLogger('ocrd.resolver.workspace_from_url')
if mets_url is None:
raise ValueError("Must pass 'mets_url' workspace_from_url")
# if mets_url is a relative filename, make it absolute
if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
mets_url = str(Path(Path.cwd() / mets_url))
# if mets_basename is not given, use the last URL segment of the mets_url
if mets_basename is None:
mets_basename = nth_url_segment(mets_url, -1)
# If src_baseurl wasn't given, determine from mets_url by removing last url segment
if not src_baseurl:
last_segment = nth_url_segment(mets_url)
src_baseurl = remove_non_path_from_url(remove_non_path_from_url(mets_url)[:-len(last_segment)])
# resolve dst_dir
if not dst_dir:
if is_local_filename(mets_url):
log.debug("Deriving dst_dir %s from %s", Path(mets_url).parent, mets_url)
dst_dir = Path(mets_url).parent
else:
log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
dst_dir = mkdtemp(prefix=TMP_PREFIX)
# XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
if not Path(dst_dir).exists():
Path(dst_dir).mkdir(parents=True, exist_ok=False)
dst_dir = str(Path(dst_dir).resolve())
log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
mets_basename, mets_url, src_baseurl, dst_dir)
self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip')
workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)
if download:
for f in workspace.mets.find_files():
workspace.download_file(f)
return workspace
[docs] def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
"""
Create an empty workspace.
Arguments:
directory (string): Target directory for the workspace. \
If ``none``, create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \
(The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
Keyword Arguments:
clobber_mets (boolean, False): Whether to overwrite existing ``mets.xml``. \
By default existing ``mets.xml`` will raise an exception.
Returns:
a new :py:class:`~ocrd.workspace.Workspace`
"""
log = getLogger('ocrd.resolver.workspace_from_nothing')
if directory is None:
directory = mkdtemp(prefix=TMP_PREFIX)
Path(directory).mkdir(parents=True, exist_ok=True)
mets_path = Path(directory, mets_basename)
if mets_path.exists() and not clobber_mets:
raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory))
mets = OcrdMets.empty_mets()
log.info("Writing METS to %s", mets_path)
mets_path.write_bytes(mets.to_xml(xmllint=True))
return Workspace(self, directory, mets, mets_basename=mets_basename)