#!/usr/bin/env python3
"""xmlparser.py
Parse a NASA Ames PAH IR Spectroscopic Database XML library file, with
or without schema checking.
"""
import array
import base64
from binascii import crc32
from typing import Optional, Union
from urllib.error import HTTPError, URLError
import urllib.request
from lxml import etree # type: ignore
class XMLparser:
    """
    Parse a NASA Ames PAH IR Spectroscopic library XML-file.

    Optional behavior includes validating against a schema.

    Attributes:
        filename (str): XML filename.
        validate (bool): Whether to validate the XML schema or not.
        library (dict): Dictionary containing the parsed data.

    Examples:
        parser = XMLparser(filename="xml_file.xml")
        parser.verify_schema()
        library = parser.to_pahdb_dict()

        parser = XMLparser(filename="xml_file.xml", validate=True)
        library = parser.to_pahdb_dict()

        parser = XMLparser()
        parser.filename = "xml_file.xml"
        parser.validate = True
        library = parser.to_pahdb_dict()
    """

    # Qualified name of the root attribute that points at the schema.
    _SCHEMA_LOCATION = "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"

    # Value stored under an atom's "type" key -> name of the per-specie counter.
    _ATOM_COUNT_KEYS = {
        6: "n_c",
        1: "n_h",
        7: "n_n",
        8: "n_o",
        12: "n_mg",
        14: "n_si",
        26: "n_fe",
    }

    def __init__(self, filename: Optional[str] = None, validate: bool = False) -> None:
        """
        Inits XMLparser with schema checking off, no given filename.

        Args:
            filename: Path of the XML file to parse; may be set later.
            validate: Whether to_pahdb_dict() validates before parsing.
        """
        self.filename = filename
        self.validate = validate
        self.library: dict = dict()

    def __repr__(self) -> str:
        """
        Class representation.
        """
        return f"{self.__class__.__name__}({self.filename=})"

    def __str__(self) -> str:
        """
        A description of the instance.
        """
        return f"AmesPAHdbPythonSuite XML parser.\nXML filename: {self.filename=}."

    def verify_schema(self) -> bool:
        """
        Validate against linked schema.

        Note:
            Requires that self.filename is set.
            It sets the internal attributes:
                _tree (etree.ElementTree): parsed XML file.
                _root: root element of ElementTree tree.

        Returns:
            True when the document validates, when the root element carries
            no usable schemaLocation, or when the schema cannot be fetched.

        Raises:
            etree.DocumentInvalid: The document does not conform to the
                linked schema.
        """
        self._tree = etree.parse(self.filename)
        self._root = self._tree.getroot()

        location = self._root.get(self._SCHEMA_LOCATION)
        if not location:
            return True

        # schemaLocation is a whitespace-separated list of
        # "namespace location" pairs; any run of blanks, tabs or newlines
        # may separate the tokens, so split on whitespace in general and
        # use the location of the first pair.
        tokens = location.split()
        if len(tokens) < 2:
            return True
        uri = tokens[1]

        try:
            # The context manager closes the connection once the schema
            # document has been read.
            with urllib.request.urlopen(uri, timeout=3.0) as response:
                doc = etree.parse(response)
        except (HTTPError, URLError):
            # TODO For now, fallback to True if we can't get a schema, use False instead?
            return True

        etree.XMLSchema(doc).assertValid(self._tree)
        return True

    def to_pahdb_dict(self, validate: bool = False) -> dict:
        """
        Parses the XML, with or without validation.

        Args:
            validate (bool): Request schema validation for this call.
                Validation is performed when either this argument or
                self.validate is true.

        Note:
            Sets the attribute self.library when successful.

        Returns: library (dict): Dictionary, with the UIDs as keys,
            containing the transitions, geometry data, as well as UID
            metadata, references, comments, and laboratory.
        """
        # Start from an empty library so that a previous parse on this
        # instance cannot leak keys into the new result.
        self.library = dict()

        if self.validate or validate:
            self.verify_schema()
            # Validation already built the tree; walk it instead of
            # parsing the file a second time.
            self._context = etree.iterwalk(self._tree, events=("start", "end"))
        else:
            self._context = etree.iterparse(self.filename, events=("start", "end"))

        self.library = self._tree_to_pahdb_dict()
        return self.library

    def _tree_to_pahdb_dict(self) -> dict:
        """
        Convert the element tree to a pahdb_dict.

        Consumes self._context up to the closing <pahdatabase> tag, or to
        the end of the event stream if that tag never shows up.

        Returns: library: Dictionary, with the UIDs as keys,
            containing the transitions, geometry data, as well as UID
            metadata, references, comments, and laboratory.
        """
        for action, elem in self._context:
            tag = etree.QName(elem).localname
            if action == "start":
                if tag == "species":
                    self.library["species"] = self._species_handler(self._context)
                elif tag == "pahdatabase":
                    self.library.update(elem.attrib)
            elif action == "end":
                if tag == "comment":
                    self.library["comment"] = elem.text
                elif tag == "pahdatabase":
                    break
                # Only finished elements are released.
                elem.clear()
        return self.library

    def _species_handler(self, context: "Union[etree.iterwalk, etree.iterparse]") -> dict:
        """
        Parse a PAHdb XML <species> tag.

        Args:
            context: Event iterator positioned just after the <species>
                start event.

        Returns:
            Dictionary mapping each specie's uid to its parsed dictionary.
        """
        species: dict = dict()
        for action, elem in context:
            tag = etree.QName(elem).localname
            if action == "start":
                if tag != "specie":
                    continue
                clustered = self.library["database"] == "clusters/theoretical"
                if clustered:
                    # The uid is the CRC32 of the joined identifying
                    # attributes rather than a "uid" attribute.
                    label = "-".join(
                        elem.attrib[key]
                        for key in ("monomers", "type", "conformation")
                    )
                    uid = crc32(label.encode("utf-8"))
                else:
                    uid = int(elem.attrib["uid"])

                specie = self._specie_handler(context)
                if clustered:
                    specie.update(elem.attrib)
                    # A value without a comma is stored as an integer.
                    if "," not in specie["monomers"]:
                        specie["monomers"] = int(specie["monomers"])
                species[uid] = specie
            elif tag == "species":
                break
            # Reached for a fully handled <specie> and for every other
            # closed element.
            elem.clear()
        return species

    def _specie_handler(self, context: "Union[etree.iterwalk, etree.iterparse]") -> dict:
        """
        Parse a PAHdb XML <specie> tag.

        Args:
            context: Event iterator positioned just after the <specie>
                start event.

        Returns:
            Dictionary with the keys "comments", "references", "geometry",
            "transitions" and "laboratory", one entry per remaining child
            element (numeric text converted to int or float), and the atom
            counts "n_c", "n_h", "n_n", "n_o", "n_mg", "n_si" and "n_fe".
        """
        specie: dict = {
            "comments": list(),
            "references": list(),
            "geometry": list(),
            "transitions": list(),
            "laboratory": dict(),
        }
        for action, elem in context:
            tag = etree.QName(elem).localname
            if action == "start":
                # Container children are consumed whole by their handler.
                if tag == "comments":
                    specie["comments"] = self._text_list_handler(
                        context, "comment", "comments"
                    )
                elif tag == "references":
                    specie["references"] = self._text_list_handler(
                        context, "reference", "references"
                    )
                elif tag == "geometry":
                    specie["geometry"] = self._geometry_handler(context)
                elif tag == "transitions":
                    specie["transitions"] = self._transitions_handler(context)
                elif tag == "laboratory":
                    specie["laboratory"] = self._laboratory_handler(context)
                continue

            if tag == "specie":
                specie.update(self._count_atoms(specie["geometry"]))
                break

            specie[tag] = self._parse_number(elem.text, integral_as_int=True)
            elem.clear()
        return specie

    @staticmethod
    def _parse_number(text: Optional[str], integral_as_int: bool = False):
        """
        Convert text to a number when possible.

        Args:
            text: Element text or attribute value; may be None for an
                empty element.
            integral_as_int: Return whole-valued numbers as int.

        Returns:
            The parsed float (or int), otherwise ``text`` unchanged.
        """
        try:
            value = float(text)  # type: ignore[arg-type]
        except (TypeError, ValueError):
            # TypeError covers an element without text (text is None).
            return text
        if integral_as_int and value % 1 == 0:
            return int(value)
        return value

    @classmethod
    def _count_atoms(cls, geometry: list) -> dict:
        """
        Count the atoms of each tracked type in a parsed geometry.

        Args:
            geometry: List of atom dictionaries, each with a "type" key.

        Returns:
            Dictionary with one counter per entry of _ATOM_COUNT_KEYS,
            zero when no atom of that type is present.
        """
        counts = {key: 0 for key in cls._ATOM_COUNT_KEYS.values()}
        for atom in geometry:
            # The type is stored as a float; 6.0 finds the key 6.
            key = cls._ATOM_COUNT_KEYS.get(atom["type"])
            if key is not None:
                counts[key] += 1
        return counts

    @staticmethod
    def _text_list_handler(
        context: "Union[etree.iterwalk, etree.iterparse]",
        item_tag: str,
        list_tag: str,
    ) -> list:
        """
        Collect the text of every <item_tag> up to the closing <list_tag>.
        """
        texts = list()
        for action, elem in context:
            if action != "end":
                continue
            tag = etree.QName(elem).localname
            if tag == item_tag:
                texts.append(elem.text)
            elif tag == list_tag:
                break
            elem.clear()
        return texts

    @staticmethod
    def _geometry_handler(context: "Union[etree.iterwalk, etree.iterparse]") -> list:
        """
        <specie> tag: Parse its child <geometry> tag.

        Returns:
            One dictionary per <atom>, mapping each child tag to the float
            value of its text.
        """
        geometry = list()
        atom: Optional[dict] = None  # dictionary of the currently open <atom>
        for action, elem in context:
            tag = etree.QName(elem).localname
            if action == "start":
                if tag == "atom":
                    atom = dict()
                continue
            if tag == "geometry":
                break
            if tag == "atom":
                geometry.append(atom)
                atom = None
            elif atom is not None:
                atom[tag] = float(elem.text)
            elem.clear()
        return geometry

    @classmethod
    def _transitions_handler(
        cls, context: "Union[etree.iterwalk, etree.iterparse]"
    ) -> list:
        """
        <specie> tag: Parse its child <transitions> tag.

        Returns:
            One dictionary per <mode>, holding the attributes and the text
            of each of its children, converted to float when numeric.
        """
        transitions = list()
        mode: Optional[dict] = None  # dictionary of the currently open <mode>
        for action, elem in context:
            tag = etree.QName(elem).localname
            if action == "start":
                if tag == "mode":
                    mode = dict()
                continue
            if tag == "transitions":
                break
            if tag == "mode":
                transitions.append(mode)
                mode = None
            elif mode is not None:
                for attr, text in elem.attrib.items():
                    mode[attr] = cls._parse_number(text)
                mode[tag] = cls._parse_number(elem.text)
            elem.clear()
        return transitions

    @staticmethod
    def _laboratory_handler(context: "Union[etree.iterwalk, etree.iterparse]") -> dict:
        """
        <specie> tag: Parse its child <laboratory> tag.

        Returns:
            Dictionary with, when present, "frequency" and "intensity" as
            array.array("f") built from the base64-decoded element text.
        """
        laboratory = dict()
        for action, elem in context:
            if action != "end":
                continue
            tag = etree.QName(elem).localname
            if tag in ("frequency", "intensity"):
                raw = base64.b64decode(elem.text)
                laboratory[tag] = array.array("f", raw)
            elif tag == "laboratory":
                break
            elem.clear()
        return laboratory