In [38]:
import os.path
from lxml import etree

datadir = "publicdata"

def localXML(filename, datafolder=".", parser=None):
    """Parse a local XML file and return its root element.

    Parameters
    ----------
    filename : str
        Name of the XML file.
    datafolder : str, optional
        Directory that contains the file (defaults to the current directory).
    parser : optional
        Parser object handed to ``etree.parse``. When omitted, an
        ``etree.XMLParser(remove_blank_text=True)`` is created.

    Returns
    -------
    The root element of the parsed tree, or ``None`` when the file does not
    exist or cannot be parsed. A diagnostic is printed in both failure cases.
    """
    filepath = os.path.join(datafolder, filename)
    if not os.path.isfile(filepath):
        print("File not found: {}".format(filepath))
        return None

    try:
        if parser is None:
            parser = etree.XMLParser(remove_blank_text=True)
        mytree = etree.parse(filepath, parser=parser)
    except Exception as e:
        # Stay best-effort (callers check for None), but report why parsing
        # failed instead of swallowing the error silently.
        print("Error parsing {}: {}".format(filepath, e))
        return None

    return mytree.getroot()

In [39]:
# Load the root element of each sample document from the data directory.
book_root = localXML("bookstore.xml", datadir)
ind_root = localXML("ind0.xml", datadir)
widom_root = localXML("widombooks.xml", datadir)

# localXML returns None on failure, so fail fast and name the document
# that could not be loaded.
assert book_root is not None, "could not load bookstore.xml"
assert ind_root is not None, "could not load ind0.xml"
assert widom_root is not None, "could not load widombooks.xml"

## Quiz Question 4 Examples

### First child of root

In [43]:
# Equivalent ways to reach the first child of the root element.
# getchildren() is deprecated in lxml; list(element) or direct indexing
# replace it.
fd = list(book_root)                 # materialize all children as a list
firstchild = fd[0]                   # ... then index into that list
firstchild = book_root[0]            # elements support list-style indexing
firstchild = book_root.find("book")  # first child whose tag is "book"
firstchild

<Element book at 0x10eda2820>

In [44]:
# An absolute location path returns every node that matches its last step,
# much like findall(): here, all <book> children of <catalog>.

xp = """/catalog/book"""
node_set = book_root.xpath(xp)

In [45]:
# First match of the /catalog/book query (an Element, see the output below).
node_set[0]

<Element book at 0x10eda2820>

In [46]:
# Select only the first <book> by adding a position predicate.
# XPath positions are 1-based, so [1] is the first match.

xp = """/catalog/book[1]"""
node_set = book_root.xpath(xp)

In [48]:
# The result is still a list, now holding a single element.
node_set

[<Element book at 0x10eda2820>]

### Value of attribute of first child of root

In [23]:
# Read the "id" attribute of the root's first child from its attrib mapping.
firstid = book_root[0].attrib["id"]
firstid

'bk101'

In [52]:
# Extend the first-book XPath with one more step, @id, to select its
# "id" attribute.

xp = """/catalog/book[1]/@id"""
node_set = book_root.xpath(xp)

In [53]:
# The attribute step yields the attribute value as a string, not an element.
node_set

['bk101']

### Children (tags) of first child of root

In [24]:
# Tag name of every child of the root's first child.
# Iterating an element visits its children directly, which replaces the
# deprecated getchildren() call.
tag_list = [child.tag for child in book_root[0]]
tag_list

['author', 'title', 'genre', 'price', 'publish_date', 'description']

In [54]:
# Extend the first-book XPath with a wildcard step, *, to select all of
# its child elements.

xp = """/catalog/book[1]/*"""
node_set = book_root.xpath(xp)

In [55]:
# Show the tag of each selected child element.
[e.tag for e in node_set]

['author', 'title', 'genre', 'price', 'publish_date', 'description']

### List of prices

In [31]:
# Text of each book's <price> child, in document order.
# Iterating the root replaces the deprecated getchildren() call.
# findtext() returns None for a book that has no <price> child instead of
# raising AttributeError (an empty <price/> yields '').
price_list = [book.findtext("price") for book in book_root]
price_list

['44.95',
 '5.95',
 '5.95',
 '5.95',
 '5.95',
 '4.95',
 '4.95',
 '4.95',
 '6.95',
 '36.95',
 '36.95',
 '49.95']

In [58]:
# Walk /catalog/book/price and take the text() node of every <price> element.

xp = """/catalog/book/price/text()"""
node_set = book_root.xpath(xp)

In [59]:
# The text() step yields the price strings themselves.
node_set

['44.95',
 '5.95',
 '5.95',
 '5.95',
 '5.95',
 '4.95',
 '4.95',
 '4.95',
 '6.95',
 '36.95',
 '36.95',
 '49.95']

In [62]:
# NOTE: this is not a shortcut for the prices. //text() selects every text
# node anywhere in the document (authors, titles, descriptions, ...), as
# the output below shows. //price/text() would restrict it to price text.

xp = """//text()"""
node_set = book_root.xpath(xp)

In [63]:
# All text nodes of the document in document order, not just the prices.
node_set

['Gambardella, Matthew',
 "XML Developer's Guide",
 'Computer',
 '44.95',
 '2000-10-01',
 'An in-depth look at creating applications \n      with XML.',
 'Ralls, Kim',
 'Midnight Rain',
 'Fantasy',
 '5.95',
 '2000-12-16',
 'A former architect battles corporate zombies, \n      an evil sorceress, and her own childhood to become queen \n      of the world.',
 'Corets, Eva',
 'Maeve Ascendant',
 'Fantasy',
 '5.95',
 '2000-11-17',
 'After the collapse of a nanotechnology \n      society in England, the young survivors lay the \n      foundation for a new society.',
 'Corets, Eva',
 "Oberon's Legacy",
 'Fantasy',
 '5.95',
 '2001-03-10',
 'In post-apocalypse England, the mysterious \n      agent known only as Oberon helps to create a new life \n      for the inhabitants of London. Sequel to Maeve \n      Ascendant.',
 'Corets, Eva',
 'The Sundered Grail',
 'Fantasy',
 '5.95',
 '2001-09-10',
 "The two daughters of Maeve, half-sisters, \n      battle one another for control of England. Sequel 