In [9]:
import os.path
from lxml import etree

# Folder holding the XML sample files; passed to localXML as its datafolder argument.
datadir = "publicdata"

def localXML(filename, datafolder=".", parser=None):
    """Parse a local XML file and return the root element of its tree.

    Args:
        filename: Name of the XML file to read.
        datafolder: Folder that contains ``filename``; joined with it to form
            the path that is opened.
        parser: Optional parser handed to ``etree.parse``. When omitted, an
            ``etree.XMLParser(remove_blank_text=True)`` is created.

    Returns:
        The root element of the parsed document, or ``None`` when the file
        does not exist or could not be parsed. In both failure cases a short
        message naming the path is printed so the cause is visible.
    """
    filepath = os.path.join(datafolder, filename)
    if not os.path.isfile(filepath):
        print("File not found: {}".format(filepath))
        return None

    try:
        # Compare against None explicitly so that only an omitted parser
        # triggers creation of the default one.
        if parser is None:
            parser = etree.XMLParser(remove_blank_text=True)
        mytree = etree.parse(filepath, parser=parser)
    except Exception as e:
        # Stay best-effort (callers check for None), but say what went wrong
        # instead of discarding the error silently.
        print("Could not parse {}: {}".format(filepath, e))
        return None

    return mytree.getroot()
In [13]:
# Parse the three sample documents from the data folder, in the same order as
# the names they are bound to.
book_root, ind_root, widom_root = (
    localXML(xml_name, datadir)
    for xml_name in ("bookstore.xml", "ind0.xml", "widombooks.xml")
)

# Stop here if any document failed to load: localXML returns None in that case.
assert all(root is not None for root in (book_root, ind_root, widom_root))

Quiz Question 4 Examples

First child of root

In [15]:
# Index the root element directly to get its first child; getchildren() is
# deprecated in lxml, and indexing matches the book_root[0] form used further down.
firstchild = book_root[0]
print(firstchild)
# Alternatives?
<Element book at 0x10ed89dc0>
In [17]:
# Get multipath; like "findall" for location step of last tag
# Exercise placeholder: replace ?? with an XPath expression. It is evaluated
# with book_root as the context node, and node_set receives the result.
# NOTE(review): the literal ?? does not look like a valid XPath expression, so
# this cell presumably errors until the placeholder is filled in -- confirm.

xp = """??"""
node_set = book_root.xpath(xp)
In [19]:
# Display the complete result: the recorded output for this cell is the list of
# every matched book element, not a single node.
node_set
Out[19]:
[<Element book at 0x10ed89dc0>,
 <Element book at 0x10ece50f0>,
 <Element book at 0x10ed8c9b0>,
 <Element book at 0x10ed8c690>,
 <Element book at 0x10ed8cb90>,
 <Element book at 0x10ed8caf0>,
 <Element book at 0x10ed8cc80>,
 <Element book at 0x10ed8ccd0>,
 <Element book at 0x10ed8cd70>,
 <Element book at 0x10ed8ce10>,
 <Element book at 0x10ed8ce60>,
 <Element book at 0x10ed8cf00>]
In [17]:
# Get single by adding a position predicate
# Exercise placeholder: replace ?? with an XPath expression. It is evaluated
# with book_root as the context node, and node_set receives the result.
# NOTE(review): the literal ?? does not look like a valid XPath expression, so
# this cell presumably errors until the placeholder is filled in -- confirm.

xp = """??"""
node_set = book_root.xpath(xp)
In [19]:
# Display only the first item of the XPath result.
# NOTE(review): the recorded output below is a list of twelve book elements,
# which does not look like the result of indexing one item -- the output
# appears stale; re-run to confirm.
node_set[0]
Out[19]:
[<Element book at 0x10ed89dc0>,
 <Element book at 0x10ece50f0>,
 <Element book at 0x10ed8c9b0>,
 <Element book at 0x10ed8c690>,
 <Element book at 0x10ed8cb90>,
 <Element book at 0x10ed8caf0>,
 <Element book at 0x10ed8cc80>,
 <Element book at 0x10ed8ccd0>,
 <Element book at 0x10ed8cd70>,
 <Element book at 0x10ed8ce10>,
 <Element book at 0x10ed8ce60>,
 <Element book at 0x10ed8cf00>]

Value of attribute of first child of root

In [23]:
# Reach the first child by indexing the root, then look up "id" in that
# element's attribute mapping.
first_book = book_root[0]
firstid = first_book.attrib["id"]
firstid
Out[23]:
'bk101'
In [20]:
# Use above XPath and then take another "step" for attribute
# Exercise placeholder: replace ?? with an XPath expression evaluated with
# book_root as the context node; per the recorded output, the first item of
# the result is the string 'bk101'.
# NOTE(review): the literal ?? does not look like a valid XPath expression, so
# this cell presumably errors until the placeholder is filled in -- confirm.

xp = """??"""
node_set = book_root.xpath(xp)
In [21]:
# Display the first item of the XPath result.
node_set[0]
Out[21]:
'bk101'

Children (tags) of first child of root

In [24]:
# Iterating an element yields its children, so the first book can be walked
# directly (getchildren() is deprecated in lxml).
tag_list = [child.tag for child in book_root[0]]
tag_list
Out[24]:
['author', 'title', 'genre', 'price', 'publish_date', 'description']
In [28]:
# Use above XPath and then take another "step" to the child elements
# Exercise placeholder: replace ?? with an XPath expression evaluated with
# book_root as the context node; the following cell reads .tag from every
# item of the result.
# NOTE(review): the literal ?? does not look like a valid XPath expression, so
# this cell presumably errors until the placeholder is filled in -- confirm.

xp = """??"""
node_set = book_root.xpath(xp)
In [30]:
# List the tag name of every node in the XPath result.
[node.tag for node in node_set]
Out[30]:
['author', 'title', 'genre', 'price', 'publish_date', 'description']

List of prices

In [31]:
# Iterate the root directly (getchildren() is deprecated in lxml) and collect
# the text of each child's <price> element.
price_list = [book.find("price").text for book in book_root]
price_list
Out[31]:
['44.95',
 '5.95',
 '5.95',
 '5.95',
 '5.95',
 '4.95',
 '4.95',
 '4.95',
 '6.95',
 '36.95',
 '36.95',
 '49.95']
In [34]:
# Traversal to price element and its text
# Exercise placeholder: replace ?? with an XPath expression evaluated with
# book_root as the context node; per the recorded output, the result is a
# list of price strings.
# NOTE(review): the literal ?? does not look like a valid XPath expression, so
# this cell presumably errors until the placeholder is filled in -- confirm.

xp = """??"""
node_set = book_root.xpath(xp)
In [36]:
# Display the full XPath result.
node_set
Out[36]:
['44.95',
 '5.95',
 '5.95',
 '5.95',
 '5.95',
 '4.95',
 '4.95',
 '4.95',
 '6.95',
 '36.95',
 '36.95',
 '49.95']
In [34]:
# Traversal to price element and its text -- shortcut
# Exercise placeholder: replace ?? with an XPath expression evaluated with
# book_root as the context node; per the recorded output, the result is the
# same list of price strings as the longer traversal.
# NOTE(review): "shortcut" presumably refers to XPath's abbreviated // syntax --
# confirm. The literal ?? does not look like a valid XPath expression, so this
# cell presumably errors until the placeholder is filled in.

xp = """??"""
node_set = book_root.xpath(xp)
In [36]:
# Display the full XPath result.
node_set
Out[36]:
['44.95',
 '5.95',
 '5.95',
 '5.95',
 '5.95',
 '4.95',
 '4.95',
 '4.95',
 '6.95',
 '36.95',
 '36.95',
 '49.95']
In [ ]: