# Source code for ocrd.processor.helpers

"""
Helper methods for running and documenting processors.
"""
from time import perf_counter, process_time
import json
import inspect
from subprocess import run, PIPE

from click import wrap_text
from ocrd_utils import getLogger

# Public API of this module: the processor runners and the CLI help generator.
__all__ = [
    'generate_processor_help',
    'run_cli',
    'run_processor',
]

def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None):
    if workspace is None:
        if resolver is None:
            raise Exception("Need to pass a resolver to create a workspace")
        if mets_url is None:
            raise Exception("Need to pass mets_url to create a workspace")
        workspace = resolver.workspace_from_url(mets_url, dst_dir=working_dir)
    return workspace

def run_processor(
        processorClass,
        ocrd_tool=None,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None,         # TODO actually use this!
        input_file_grp=None,
        output_file_grp=None,
        show_resource=None,
        list_resources=False,
        parameter=None,
        parameter_override=None,
        working_dir=None,
): # pylint: disable=too-many-locals
    """
    Instantiate a Pythonic processor, open a workspace, run the processor and save the workspace.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Instantiate a Python object for :py:attr:`processorClass`, passing:
    - the workspace,
    - :py:attr:`ocrd_tool`
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter`

    Run the processor on the workspace (creating output files in the filesystem).

    Finally, write back the workspace (updating the METS in the filesystem).

    Note that :py:attr:`log_level`, :py:attr:`show_resource`,
    :py:attr:`list_resources` and :py:attr:`parameter_override` are accepted
    but not used by the body of this function.

    Args:
        processorClass (object): Python class of the module processor.

    Returns:
        The processor instance, after its ``process()`` method has run.
    """
    workspace = _get_workspace(
        workspace,
        resolver,
        mets_url,
        working_dir
    )
    log = getLogger('ocrd.processor.helpers.run_processor')
    log.debug("Running processor %s", processorClass)
    processor = processorClass(
        workspace,
        ocrd_tool=ocrd_tool,
        page_id=page_id,
        input_file_grp=input_file_grp,
        output_file_grp=output_file_grp,
        parameter=parameter
    )
    # From here on, use the tool description the processor instance carries.
    ocrd_tool = processor.ocrd_tool
    name = '%s v%s' % (ocrd_tool['executable'], processor.version)
    otherrole = ocrd_tool['steps'][0]
    log_profile = getLogger('ocrd.process.profile')
    log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)

    # Measure wall-clock and CPU time around the actual processing only.
    t0_wall = perf_counter()
    t0_cpu = process_time()
    processor.process()
    t1_wall = perf_counter() - t0_wall
    t1_cpu = process_time() - t0_cpu
    # Arguments are passed lazily so the message is only formatted if emitted.
    log_profile.info(
        "Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s']",
        ocrd_tool['executable'],
        t1_wall,
        t1_cpu,
        input_file_grp if input_file_grp else '',
        output_file_grp if output_file_grp else '',
        json.dumps(parameter) if parameter else {}
    )

    # Record the processor run as a software agent in the METS, then persist it.
    workspace.mets.add_agent(
        name=name,
        _type='OTHER',
        othertype='SOFTWARE',
        role='OTHER',
        otherrole=otherrole
    )
    workspace.save_mets()
    return processor
def run_cli(
        executable,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        overwrite=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
):
    """
    Open a workspace and run a processor on the command line.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Run the processor CLI :py:attr:`executable` on the workspace, passing:
    - the workspace directory and :py:attr:`mets_url`,
    - :py:attr:`log_level`
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter`
    - :py:attr:`overwrite` (as a flag)

    (Will create output files and update the METS in the filesystem).

    Args:
        executable (string): Executable name of the module processor.

    Returns:
        int: the exit code of the subprocess (non-zero codes do not raise).
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    # --working-dir needs a value; without one the next option would be
    # consumed as the directory.
    args = [executable, '--working-dir', workspace.directory]
    # NOTE(review): when a workspace is passed without mets_url, this adds
    # None to args and the join below fails -- confirm the intended fallback.
    args += ['--mets', mets_url]
    if log_level:
        args += ['--log-level', log_level]
    if page_id:
        args += ['--page-id', page_id]
    if input_file_grp:
        args += ['--input-file-grp', input_file_grp]
    if output_file_grp:
        args += ['--output-file-grp', output_file_grp]
    if parameter:
        args += ['--parameter', parameter]
    if overwrite:
        args += ['--overwrite']
    log = getLogger('ocrd.processor.helpers.run_cli')
    log.debug("Running subprocess '%s'", ' '.join(args))
    # check=False: report failure through the return code instead of raising.
    result = run(args, check=False)
    return result.returncode
def generate_processor_help(ocrd_tool, processor_instance=None):
    """Generate a string describing the full CLI of this processor including params.

    Args:
        ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
        processor_instance (object, optional): the processor implementation
            (for adding any module/class/function docstrings)

    Returns:
        str: the help text, consisting of usage line, description (plus any
            docstrings found), option list, parameter list and default wiring.
    """
    parameters = ocrd_tool.get('parameters')
    if not parameters:
        parameter_help = ' NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        pieces = []
        for param_name, param in parameters.items():
            # A required parameter is flagged as such; otherwise show its
            # default (JSON-encoded) when one is declared.
            if param.get('required'):
                suffix = ' - REQUIRED'
            elif 'default' in param:
                suffix = ' - %s' % json.dumps(param['default'])
            else:
                suffix = ''
            pieces.append(wrap('"%s" [%s%s]' % (param_name, param['type'], suffix)))
            pieces.append('\n ' + wrap(param['description']))
            if 'enum' in param:
                pieces.append('\n ' + wrap('Possible values: %s' % json.dumps(param['enum'])))
            pieces.append("\n")
        parameter_help = ''.join(pieces)

    doc_help = ''
    if processor_instance:
        # Collect module, class and process() docstrings, in that order.
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__)
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__)
        if processor_instance.process.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__)
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent=' > ',
                                          subsequent_indent=' > ',
                                          preserve_paragraphs=True)
    return '''
Usage: %s [OPTIONS]

  %s%s

Options:
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process
  --overwrite                     Remove existing output pages/images
                                  (with --page-id, remove only those)
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -m, --mets URL-PATH             URL or file path of METS to process
  -w, --working-dir PATH          Working directory of local workspace
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Log level
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON and exit
  -h, --help                      This help message
  -V, --version                   Show version

Parameters:
%s
Default Wiring:
  %s -> %s

''' % (
    ocrd_tool['executable'],
    ocrd_tool['description'],
    doc_help,
    parameter_help,
    ocrd_tool.get('input_file_grp', 'NONE'),
    ocrd_tool.get('output_file_grp', 'NONE')
)