In [38]:
import os.path
from lxml import etree

# Folder holding the sample XML files; passed to localXML() as `datafolder`.
datadir = "publicdata"

def localXML(filename, datafolder=".", parser=None):
    """Parse a local XML file and return its root element.

    Parameters
    ----------
    filename : str
        Name of the XML file, relative to ``datafolder``.
    datafolder : str, optional
        Directory containing the file (default: current directory).
    parser : optional
        Parser object handed to ``etree.parse``. When omitted, an
        ``etree.XMLParser(remove_blank_text=True)`` is created.

    Returns
    -------
    The root element of the parsed tree, or ``None`` when the file does not
    exist or cannot be parsed. Both failure cases print a message to stdout
    so the caller can see why ``None`` came back.
    """
    filepath = os.path.join(datafolder, filename)
    if not os.path.isfile(filepath):
        print("File not found: {}".format(filepath))
        return None

    try:
        if parser is None:
            parser = etree.XMLParser(remove_blank_text=True)
        mytree = etree.parse(filepath, parser=parser)
    except Exception as e:
        # Keep the best-effort contract (return None), but report the cause
        # instead of swallowing it silently.
        print("Could not parse {}: {}".format(filepath, e))
        return None

    return mytree.getroot()
In [39]:
# Parse the three sample documents in one pass. localXML() returns None on
# failure, so the asserts below stop the notebook early if a file is unusable.
book_root, ind_root, widom_root = [
    localXML(xml_name, datadir)
    for xml_name in ("bookstore.xml", "ind0.xml", "widombooks.xml")
]

assert book_root is not None
assert ind_root is not None
assert widom_root is not None

Quiz Question 4 Examples

First child of root

In [43]:
# Three ways to reach the first child of the root element.
# getchildren() is deprecated in lxml; list(element) is the documented replacement.
fd = list(book_root)                 # 1. materialize the children, then index
firstchild = fd[0]
firstchild = book_root[0]            # 2. index the element directly
firstchild = book_root.find("book")  # 3. first child with tag "book" (the first child here)
firstchild
Out[43]:
<Element book at 0x10eda2820>
In [44]:
# An absolute location path returns every node matching its last step,
# much like findall("book") called on the root.

xp = "/catalog/book"
node_set = book_root.xpath(xp)
In [45]:
# xpath() returned a list of matches; show the first one.
node_set[0]
Out[45]:
<Element book at 0x10eda2820>
In [46]:
# A positional predicate narrows the node set to the first <book> only.

xp = "/catalog/book[1]"
node_set = book_root.xpath(xp)
In [48]:
# The result is still a list, now holding just the one selected <book>.
node_set
Out[48]:
[<Element book at 0x10eda2820>]

Value of attribute of first child of root

In [23]:
# Index the root for its first child, then look up "id" in that element's
# attribute mapping.
firstid = book_root[0].attrib["id"]
firstid
Out[23]:
'bk101'
In [52]:
# Extend the first-book path with one more step, @id, to read the attribute.

xp = "/catalog/book[1]/@id"
node_set = book_root.xpath(xp)
In [53]:
# An attribute step yields the attribute values themselves, as strings.
node_set
Out[53]:
['bk101']

Children (tags) of first child of root

In [24]:
# Tags of the first book's children. Iterating an element walks its direct
# children, so the deprecated getchildren() call is not needed.
tag_list = [child.tag for child in book_root[0]]
tag_list
Out[24]:
['author', 'title', 'genre', 'price', 'publish_date', 'description']
In [54]:
# Extend the first-book path with a wildcard step: * selects all of its
# child elements (not attributes).

xp = "/catalog/book[1]/*"
node_set = book_root.xpath(xp)
In [55]:
# Reduce the matched child elements to their tag names.
[child.tag for child in node_set]
Out[55]:
['author', 'title', 'genre', 'price', 'publish_date', 'description']

List of prices

In [31]:
# Price text of every book under the root. Iterating the root replaces the
# deprecated getchildren(); findtext() returns None for a book that has no
# <price> child instead of raising AttributeError on `None.text`.
price_list = [book_node.findtext("price") for book_node in book_root]
price_list
Out[31]:
['44.95',
 '5.95',
 '5.95',
 '5.95',
 '5.95',
 '4.95',
 '4.95',
 '4.95',
 '6.95',
 '36.95',
 '36.95',
 '49.95']
In [58]:
# Walk catalog -> book -> price, then take each price's text node.

xp = "/catalog/book/price/text()"
node_set = book_root.xpath(xp)
In [59]:
# Same values as price_list from the loop-based cell above.
node_set
Out[59]:
['44.95',
 '5.95',
 '5.95',
 '5.95',
 '5.95',
 '4.95',
 '4.95',
 '4.95',
 '6.95',
 '36.95',
 '36.95',
 '49.95']
In [62]:
# The // shortcut searches the whole document rather than a fixed path.
# Note: a bare //text() returns every text node, not only the prices;
# //price/text() would restrict the result to prices.

xp = "//text()"
node_set = book_root.xpath(xp)
In [63]:
# Every text node in the document (authors, titles, genres, ...), far more
# than the prices.
node_set
Out[63]:
['Gambardella, Matthew',
 "XML Developer's Guide",
 'Computer',
 '44.95',
 '2000-10-01',
 'An in-depth look at creating applications \n      with XML.',
 'Ralls, Kim',
 'Midnight Rain',
 'Fantasy',
 '5.95',
 '2000-12-16',
 'A former architect battles corporate zombies, \n      an evil sorceress, and her own childhood to become queen \n      of the world.',
 'Corets, Eva',
 'Maeve Ascendant',
 'Fantasy',
 '5.95',
 '2000-11-17',
 'After the collapse of a nanotechnology \n      society in England, the young survivors lay the \n      foundation for a new society.',
 'Corets, Eva',
 "Oberon's Legacy",
 'Fantasy',
 '5.95',
 '2001-03-10',
 'In post-apocalypse England, the mysterious \n      agent known only as Oberon helps to create a new life \n      for the inhabitants of London. Sequel to Maeve \n      Ascendant.',
 'Corets, Eva',
 'The Sundered Grail',
 'Fantasy',
 '5.95',
 '2001-09-10',
 "The two daughters of Maeve, half-sisters, \n      battle one another for control of England. Sequel to \n      Oberon's Legacy.",
 'Randall, Cynthia',
 'Lover Birds',
 'Romance',
 '4.95',
 '2000-09-02',
 'When Carla meets Paul at an ornithology \n      conference, tempers fly as feathers get ruffled.',
 'Thurman, Paula',
 'Splish Splash',
 'Romance',
 '4.95',
 '2000-11-02',
 'A deep sea diver finds true love twenty \n      thousand leagues beneath the sea.',
 'Knorr, Stefan',
 'Creepy Crawlies',
 'Horror',
 '4.95',
 '2000-12-06',
 'An anthology of horror stories about roaches,\n      centipedes, scorpions  and other insects.',
 'Kress, Peter',
 'Paradox Lost',
 'Science Fiction',
 '6.95',
 '2000-11-02',
 'After an inadvertant trip through a Heisenberg\n      Uncertainty Device, James Salway discovers the problems \n      of being quantum.',
 "O'Brien, Tim",
 'Microsoft .NET: The Programming Bible',
 'Computer',
 '36.95',
 '2000-12-09',
 "Microsoft's .NET initiative is explored in \n      detail in this deep programmer's reference.",
 "O'Brien, Tim",
 'MSXML3: A Comprehensive Guide',
 'Computer',
 '36.95',
 '2000-12-01',
 'The Microsoft MSXML3 parser is covered in \n      detail, with attention to XML DOM interfaces, XSLT processing, \n      SAX and more.',
 'Galos, Mike',
 'Visual Studio 7: A Comprehensive Guide',
 'Computer',
 '49.95',
 '2001-04-16',
 'Microsoft Visual Studio 7 is explored in depth,\n      looking at how Visual Basic, Visual C++, C#, and ASP+ are \n      integrated into a comprehensive development \n      environment.']
In [ ]: