Source code for amespahdbpythonsuite.xmlparser

#!/usr/bin/env python3
"""xmlparser.py

Parse a NASA Ames PAH IR Spectroscopic Database XML library file, with
or without schema checking.

"""

import array
import base64
from binascii import crc32
from typing import Optional, Union
from urllib.error import HTTPError, URLError
import urllib.request

from lxml import etree  # type: ignore


class XMLparser:
    """
    Parse a NASA Ames PAH IR Spectroscopic library XML-file.

    Optional behavior includes validating against a schema.

    Attributes:
        filename (str): XML filename.
        validate (bool): Whether to validate the XML schema or not.
        library (dict): Dictionary containing the parsed data.

    Examples:
        parser = XMLparser(filename="xml_file.xml")
        parser.verify_schema()
        library = parser.to_pahdb_dict()

        parser = XMLparser(filename="xml_file.xml", validate=True)
        library = parser.to_pahdb_dict()

        parser = XMLparser()
        parser.filename = "xml_file.xml"
        parser.validate = True
        library = parser.to_pahdb_dict()

    """

    # Name of the xsi:schemaLocation attribute in Clark notation.
    _SCHEMA_LOCATION = (
        "{http://www.w3.org/2001/XMLSchema-instance}" + "schemaLocation"
    )

    # Value of the root "database" attribute for which species are keyed
    # by a CRC32 checksum instead of by their "uid" attribute.
    _CLUSTERS_DATABASE = "clusters/theoretical"

    # Per-specie atom-count keys and the geometry "type" value each one counts.
    _ATOM_NUMBERS = {
        "n_c": 6,
        "n_h": 1,
        "n_n": 7,
        "n_o": 8,
        "n_mg": 12,
        "n_si": 14,
        "n_fe": 26,
    }

    def __init__(self, filename: Optional[str] = None, validate: bool = False) -> None:
        """
        Inits XMLparser with schema checking off, no given filename.

        Args:
            filename: XML filename, may also be assigned later.
            validate: Whether to_pahdb_dict should validate against the schema.

        """
        self.filename = filename
        self.validate = validate
        self.library: dict = dict()

    def __repr__(self) -> str:
        """
        Class representation.

        """
        # The "self.filename=" prefix produced by the f-string "=" specifier
        # is part of the established output and is kept as-is.
        return f"{self.__class__.__name__}(" f"{self.filename=})"

    def __str__(self) -> str:
        """
        A description of the instance.

        """
        return f"AmesPAHdbPythonSuite XML parser.\n" f"XML filename: {self.filename=}."

    def verify_schema(self) -> bool:
        """
        Validate against linked schema.

        Note:
            Requires that self.filename is set.

            It sets the internal attributes:
                _tree (etree.ElementTree): parsed XML file.
                _root: root element of ElementTree tree.

        Returns:
            True when the document validates, when it does not declare a
            schema location, or when the schema cannot be downloaded.

        Raises:
            ValueError: The xsi:schemaLocation attribute does not hold a
                namespace/URI pair.
            etree.DocumentInvalid: The document does not conform to the
                schema (raised by XMLSchema.assertValid).

        """
        self._tree = etree.parse(self.filename)
        self._root = self._tree.getroot()

        location = self._root.get(self._SCHEMA_LOCATION)
        if not location:
            return True

        # xsi:schemaLocation holds whitespace-separated (namespace, URI)
        # pairs; the schema of the first pair is the one validated against.
        parts = location.split()
        if len(parts) < 2:
            raise ValueError(f"Malformed xsi:schemaLocation attribute: {location!r}")
        uri = parts[1]

        try:
            response = urllib.request.urlopen(uri, timeout=3.0)
        except (HTTPError, URLError):
            # TODO For now, fallback to True if we can't get a schema, use False instead?
            return True

        # Close the connection as soon as the schema has been read.
        with response:
            doc = etree.parse(response)

        etree.XMLSchema(doc).assertValid(self._tree)
        return True

    def to_pahdb_dict(self, validate: bool = False) -> dict:
        """
        Parses the XML, with or without validation.

        Args:
            validate (bool): Request schema validation for this call. The
                schema is checked when either this argument or
                self.validate is True.

        Note:
            Sets the attribute self.library when successful.

        Returns:
            library (dict): Dictionary, with the UIDs as keys, containing the
            transitions, geometry data, as well as UID metadata, references,
            comments, and laboratory.

        """
        if self.validate or validate:
            # verify_schema already parsed the file into self._tree; walk it
            # rather than parsing the file a second time.
            self.verify_schema()
            self._context = etree.iterwalk(self._tree, events=("start", "end"))
        else:
            self._context = etree.iterparse(self.filename, events=("start", "end"))

        # Start from an empty library so a previous parse cannot leak into
        # this one.
        self.library = dict()
        self.library = self._tree_to_pahdb_dict()

        return self.library

    def _tree_to_pahdb_dict(self) -> dict:
        """
        Convert the element tree to a pahdb_dict.

        Consumes self._context up to the closing <pahdatabase> tag. If the
        events run out before that tag is seen, whatever was collected so
        far is returned.

        Returns:
            library: Dictionary, with the UIDs as keys, containing the
            transitions, geometry data, as well as UID metadata, references,
            comments, and laboratory.

        """
        for action, elem in self._context:
            tag = etree.QName(elem).localname

            if action == "start":
                if tag == "species":
                    self.library["species"] = self._species_handler(self._context)
                elif tag == "pahdatabase":
                    # The root attributes become top-level library entries.
                    self.library.update(elem.attrib)
            elif action == "end":
                if tag == "comment":
                    self.library["comment"] = elem.text
                elif tag == "pahdatabase":
                    break
                # Only finished elements are cleared; clearing on "start"
                # would discard content that still has to be visited.
                elem.clear()

        return self.library

    def _species_handler(
        self, context: "Union[etree.iterwalk, etree.iterparse]"
    ) -> dict:
        """
        Parse a PAHdb XML <species> tag.

        Args:
            context: Event iterator positioned just after the <species>
                start event.

        Returns:
            Dictionary mapping each specie's UID to its parsed data.

        """
        species: dict = dict()

        for action, elem in context:
            tag = etree.QName(elem).localname

            if action == "start":
                if tag != "specie":
                    continue

                is_cluster = self.library["database"] == self._CLUSTERS_DATABASE
                if is_cluster:
                    # No uid attribute is read for this database; key on a
                    # checksum of the identifying attributes instead.
                    key = "-".join(
                        elem.attrib[name]
                        for name in ("monomers", "type", "conformation")
                    )
                    uid = crc32(key.encode("utf-8"))
                else:
                    uid = int(elem.attrib["uid"])

                # Consumes all events up to and including the closing
                # </specie>; the element's attributes are left untouched.
                specie = self._specie_handler(context)
                species[uid] = specie

                if is_cluster:
                    specie.update(elem.attrib)
                    # A single monomer is stored as an int, a
                    # comma-separated list is kept as the string.
                    if len(specie["monomers"].split(",")) == 1:
                        specie["monomers"] = int(specie["monomers"])

            if tag == "species":
                break

            # Reached for "end" events and for a fully processed <specie>.
            elem.clear()

        return species

    def _specie_handler(
        self, context: "Union[etree.iterwalk, etree.iterparse]"
    ) -> dict:
        """
        Parse a PAHdb XML <specie> tag.

        Args:
            context: Event iterator positioned just after the <specie>
                start event.

        Returns:
            Dictionary with the keys "comments", "references", "geometry",
            "transitions" and "laboratory", one entry per remaining child
            tag, and the atom counts "n_c", "n_h", "n_n", "n_o", "n_mg",
            "n_si" and "n_fe" derived from the geometry.

        """
        specie_dict: dict = {
            "comments": list(),
            "references": list(),
            "geometry": list(),
            "transitions": list(),
            "laboratory": dict(),
        }

        for action, elem in context:
            tag = etree.QName(elem).localname

            if action == "start":
                if tag == "comments":
                    specie_dict["comments"] = self._text_list_handler(
                        context, "comment", "comments"
                    )
                elif tag == "references":
                    specie_dict["references"] = self._text_list_handler(
                        context, "reference", "references"
                    )
                elif tag == "geometry":
                    specie_dict["geometry"] = self._geometry_handler(context)
                elif tag == "transitions":
                    specie_dict["transitions"] = self._transitions_handler(context)
                elif tag == "laboratory":
                    specie_dict["laboratory"] = self._laboratory_handler(context)
            elif action == "end":
                if tag == "specie":
                    geometry = specie_dict["geometry"]
                    for key, number in self._ATOM_NUMBERS.items():
                        specie_dict[key] = sum(
                            1 for atom in geometry if atom["type"] == number
                        )
                    # The element is not cleared here: the caller still
                    # reads its attributes.
                    break

                # Any other child is stored under its tag name; an empty
                # element yields None rather than raising.
                specie_dict[tag] = self._number_or_text(elem.text, integral=True)
                elem.clear()

        return specie_dict

    @staticmethod
    def _number_or_text(
        text: Optional[str], integral: bool = False
    ) -> Union[int, float, str, None]:
        """
        Convert element or attribute text to a number when possible.

        Args:
            text: Raw text, None for an empty element.
            integral: When True, whole-valued floats are returned as int.

        Returns:
            None when text is None, the original text when it is not
            numeric, otherwise a float (or an int, see integral).

        """
        if text is None:
            return None

        try:
            value = float(text)
        except ValueError:
            return text

        if integral and value.is_integer():
            return int(value)

        return value

    @staticmethod
    def _text_list_handler(
        context: "Union[etree.iterwalk, etree.iterparse]",
        item_tag: str,
        container_tag: str,
    ) -> list:
        """
        Collect the text of every <item_tag> up to the closing <container_tag>.

        Used for the <comments> and <references> children of <specie>.

        """
        texts = list()

        for action, elem in context:
            if action != "end":
                continue

            tag = etree.QName(elem).localname
            if tag == item_tag:
                texts.append(elem.text)
            elif tag == container_tag:
                break
            elem.clear()

        return texts

    @staticmethod
    def _geometry_handler(context: "Union[etree.iterwalk, etree.iterparse]") -> list:
        """
        <specie> tag: Parse its child <geometry> tag.

        Returns:
            One dictionary per <atom>, mapping each child tag name to its
            text converted to float.

        """
        geometry = list()

        for action, elem in context:
            tag = etree.QName(elem).localname

            if action == "start":
                if tag != "atom":
                    continue

                atom_dict: dict = dict()
                for action, elem in context:
                    if action != "end":
                        continue
                    tag = etree.QName(elem).localname
                    if tag == "atom":
                        geometry.append(atom_dict)
                        break
                    atom_dict[tag] = float(elem.text)
                    elem.clear()

            if tag == "geometry":
                break

            # Here elem is a closed element, including a finished <atom>.
            elem.clear()

        return geometry

    @staticmethod
    def _transitions_handler(
        context: "Union[etree.iterwalk, etree.iterparse]",
    ) -> list:
        """
        <specie> tag: Parse its child <transitions> tag.

        Returns:
            One dictionary per <mode>, holding the attributes and the text
            of each of its children, as float where the text is numeric.

        """
        transitions = list()

        for action, elem in context:
            tag = etree.QName(elem).localname

            if tag == "mode":
                mode_dict: dict = dict()
                for action, elem in context:
                    if action != "end":
                        continue
                    tag = etree.QName(elem).localname
                    if tag == "mode":
                        transitions.append(mode_dict)
                        break
                    for attr, text in elem.attrib.items():
                        mode_dict[attr] = XMLparser._number_or_text(text)
                    # Written after the attributes so the element text wins
                    # when an attribute shares the tag's name.
                    mode_dict[tag] = XMLparser._number_or_text(elem.text)
                    elem.clear()
            elif action != "end":
                continue

            if tag == "transitions":
                break

            elem.clear()

        return transitions

    @staticmethod
    def _laboratory_handler(
        context: "Union[etree.iterwalk, etree.iterparse]",
    ) -> dict:
        """
        <specie> tag: Parse its child <laboratory> tag.

        Returns:
            Dictionary with the base64-decoded <frequency> and <intensity>
            data as array.array("f") instances.

        """
        laboratory = dict()

        for action, elem in context:
            if action != "end":
                continue

            tag = etree.QName(elem).localname
            if tag == "frequency" or tag == "intensity":
                raw = base64.b64decode(elem.text)
                laboratory[tag] = array.array("f", raw)
            elif tag == "laboratory":
                break
            elem.clear()

        return laboratory