Source code for ocrd_models.utils

"""
Utilities for ocrd_models
"""
from lxml import etree as ET

from ocrd_utils import getLogger
from .constants import NAMESPACES as NS

# Public API of this module: the names exported by a star-import.
__all__ = [
    'xmllint_format',
    'handle_oai_response',
    'is_oai_content',
    'extract_mets_from_oai_content'
]

def xmllint_format(xml):
    """
    Pretty-print XML like ``xmllint`` does.

    The input is re-parsed with blank text removed, entity references left
    unresolved and CDATA sections preserved, then serialized with
    indentation behind a fixed XML declaration line.

    Arguments:
        xml (string): Serialized XML

    Returns:
        bytes: the UTF-8 encoded, pretty-printed document, starting with
        ``<?xml version="1.0" encoding="UTF-8"?>`` on its own line.
    """
    parser = ET.XMLParser(resolve_entities=False, strip_cdata=False, remove_blank_text=True)
    document = ET.fromstring(xml, parser)
    # The declaration is prepended manually; the serialized root element
    # follows on the next line. Working in bytes avoids a decode/encode
    # round-trip of the serialized document.
    serialized = ET.tostring(document, pretty_print=True, encoding='UTF-8')
    return b'<?xml version="1.0" encoding="UTF-8"?>\n' + serialized

def handle_oai_response(response):
    """
    In case of a valid OAI-Response, extract first METS-Entry-Data

    Arguments:
        response: HTTP response object exposing a ``headers`` mapping and a
            ``content`` attribute (presumably a ``requests.Response`` --
            confirm against callers).

    Returns:
        The result of :py:func:`extract_mets_from_oai_content` when the
        response is declared as XML/text and :py:func:`is_oai_content`
        recognizes it; otherwise ``response.content`` unchanged.
    """
    # A missing (or empty) Content-Type header is treated as "not textual"
    # instead of raising KeyError; media types are compared case-insensitively.
    content_type = (response.headers.get('Content-Type') or '').lower()
    if 'xml' in content_type or 'text' in content_type:
        content = response.content
        try:
            if is_oai_content(content):
                return extract_mets_from_oai_content(content)
        except ET.LxmlError as exc:
            # Declared as text/XML but not parseable: fall through and hand
            # back the raw payload. The logger is only needed on this path.
            log = getLogger('ocrd_models.utils.handle_oai_response')
            log.warning("textual response but no xml: %s (%s)", content, exc)
    return response.content


def is_oai_content(data):
    """
    Return True if data is an OAI-PMH request/response

    Arguments:
        data: Serialized XML to inspect.

    Returns:
        bool: whether the tag of the document's root element ends with
        ``OAI-PMH``.

    Parsing errors raised by lxml are not handled here and propagate to
    the caller.
    """
    log = getLogger('ocrd_models.utils.is_oai_content')
    # Only the root tag is inspected, so entity references do not need to be
    # expanded; this mirrors the parser option used by xmllint_format.
    parser = ET.XMLParser(resolve_entities=False)
    xml_root = ET.fromstring(data, parser)
    root_tag = xml_root.tag
    # Pass the tag as a logging argument so formatting happens only when the
    # record is actually emitted.
    log.info("response data root.tag: '%s'", root_tag)
    return str(root_tag).endswith('OAI-PMH')


def extract_mets_from_oai_content(data, preamble='<?xml version="1.0" encoding="UTF-8"?>'):
    """
    Extract METS from an OAI-PMH GetRecord response

    Arguments:
        data: Serialized XML, either a METS document or a document that
            contains a METS element somewhere below its root.
        preamble (string): line placed in front of the serialized METS
            element in the extracted result.

    Returns:
        ``data`` unchanged when its root element is already ``mets``;
        otherwise bytes consisting of ``preamble`` followed by the
        pretty-printed first METS element, UTF-8 encoded, with every
        ``\\n`` replaced by ``\\r\\n``.

    Raises:
        Exception: if no METS element is found in ``data``.
    """
    # NOTE(review): parsed with lxml's default parser settings; if ``data``
    # can come from untrusted servers, consider an explicit parser that
    # restricts entity resolution -- confirm the impact on documents that
    # rely on entities before changing it.
    xml_root = ET.fromstring(data)
    # Compare the local name only: a substring test on the full tag would
    # also match a namespace URI or element name merely containing "mets".
    if ET.QName(xml_root).localname == 'mets':
        return data
    mets_root_el = xml_root.find('.//{%s}mets' % NS['mets'])
    if mets_root_el is None:
        raise Exception("Missing mets-section in %s" % data)
    new_tree = ET.ElementTree(mets_root_el)
    xml_formatted = ET.tostring(new_tree,
                                pretty_print=True,
                                encoding='UTF-8').decode('UTF-8')
    formatted_content = '{}\n{}'.format(preamble, xml_formatted)
    # Line endings are converted to CRLF, as in the original implementation.
    return formatted_content.encode('UTF-8').replace(b'\n', b'\r\n')
Source code for ocrd_models.utils

Navigation

Related Topics