Web Scraping In Class

In [3]:
import os
import io
import sys
import importlib
import pandas as pd
from lxml import etree
import requests

if os.path.isdir(os.path.join("../../..", "modules")):
    module_dir = os.path.join("../../..", "modules")
else:
    module_dir = os.path.join("../..", "modules")

module_path = os.path.abspath(module_dir)
if not module_path in sys.path:
    sys.path.append(module_path)

import util
importlib.reload(util)
Out[3]:
<module 'util' from '/Users/tcbressoud/Dropbox/cs181-DataSystems/cs181-bressoud/f20_class/modules/util.py'>

HTML Structure

In [2]:
location = "datasystems.denison.edu"
resource = "/basic.html"

url = util.buildURL(resource, location)
r = requests.get(url)
assert r.status_code == 200

print(r.text)
<!DOCTYPE html>
<html lang="en">
  <head>
    <title>Data Systems Basic HTML Page</title>
  </head>
  <body>
    <h1>First Level Heading</h1>

    <p>Paragraph defined in <b>body</b>.

    <h2>Second Level Heading</h2>

    <a href="http://docs.python.org">Link</a> to Python documentation.
    </p>

    <ul>
      <li>Item 1
      <ol>
        <li>Item 1 nested</li>
        <li>Item 2 nested</li>
      </ol>
      </li>
      <li>Item 2</li>
      <li>Item 3</li>
    </ul>
  </body>
</html>

In [18]:
bad_html = "<html><head><title>test<body><h1>header title</h3>"
xmlparser  = etree.XMLParser()
#tree = etree.parse(io.StringIO(bad_html), parser=xmlparser)
Traceback (most recent call last):

  File "/Users/tcbressoud/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)

  File "<ipython-input-18-952062f08bb2>", line 3, in <module>
    tree = etree.parse(io.StringIO(bad_html), parser=xmlparser)

  File "src/lxml/etree.pyx", line 3521, in lxml.etree.parse

  File "src/lxml/parser.pxi", line 1876, in lxml.etree._parseDocument

  File "src/lxml/parser.pxi", line 1896, in lxml.etree._parseMemoryDocument

  File "src/lxml/parser.pxi", line 1777, in lxml.etree._parseDoc

  File "src/lxml/parser.pxi", line 1082, in lxml.etree._BaseParser._parseUnicodeDoc

  File "src/lxml/parser.pxi", line 615, in lxml.etree._ParserContext._handleParseResultDoc

  File "src/lxml/parser.pxi", line 725, in lxml.etree._handleParseResult

  File "src/lxml/parser.pxi", line 654, in lxml.etree._raiseParseError

  File "<string>", line 1
XMLSyntaxError: Opening and ending tag mismatch: h1 line 1 and h3, line 1, column 52
In [21]:
html2 = "<html><head><title>test</title></head><body><h1>header title</h1></body></html>"
In [22]:
tree = etree.parse(io.StringIO(html2), parser=xmlparser)
In [20]:
htmlparser =  etree.HTMLParser()
tree = etree.parse(io.StringIO(bad_html), parser=htmlparser)

Static Web Page Example: Table

http://datasystems.denison.edu/ind2016.html

In [6]:
os.getcwd()
Out[6]:
'/Users/tcbressoud/Dropbox/cs181-DataSystems/cs181-bressoud/f20_class/source/Web-Scraping'
In [4]:
!curl -s -o ind2016.html http://datasystems.denison.edu/ind2016.html
In [7]:
location = "datasystems.denison.edu"
resource = "/ind2016.html"

url = util.buildURL(resource, location)
r = requests.get(url)
assert r.status_code == 200

util.print_text(r.text, nlines=5)
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
	<head>
		<meta charset="utf-8" />
		<meta http-equiv="X-UA-Compatible" content="IE=edge" />
In [8]:
htmlparser = etree.HTMLParser()
indroot = etree.fromstring(r.content, parser=htmlparser)
In [11]:
indtree = etree.parse(io.BytesIO(r.content), parser=htmlparser)
In [9]:
indroot
Out[9]:
<Element html at 0x118ed2e60>
In [13]:
util.print_xml(indroot, nlines=20)
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" la
  <head>
    <meta charset="utf-8"/>
    <meta http-equiv="X-UA-Compatible" content="IE=edge"/>
    <title>ind2016 | Introduction to Data Systems</title>
    <meta name="robots" content="all"/>
    <meta name="generator" content="Sandvox 2.10.12"/>
    <meta name="viewport" content="width=772"/>
    <link rel="shortcut icon" type="image/x-icon" href="fav
    <link rel="canonical" href="http://datasystems.denison.
    <script>
		WebFontConfig={google:{families:["Lobster",]},timeout:3e3
		</script>
    <link rel="stylesheet" type="text/css" href="sandvox_Ap
    <!--
		Photo credits for this website's design: <http://datasyst
		Licensing for this website's design:     <http://datasyst
		-->
  </head>
  <body class="sandvox has-page-title allow-sidebar no-cust
In [14]:
tables = indroot.xpath("//table")
In [15]:
len(tables)
Out[15]:
1
In [16]:
util.print_xml(tables[0])
<table class="table table-bordered table-hover table-conden
  <thead>
    <tr>
      <th title="Field #1">code</th>
      <th title="Field #2">country</th>
      <th title="Field #3">pop</th>
      <th title="Field #4">gdp</th>
      <th title="Field #5">life</th>
      <th title="Field #6">cell</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>CAN</td>
      <td>Canada</td>
      <td align="right">36.26</td>
      <td align="right">1535.77</td>
      <td align="right">82.3</td>
      <td align="right">30.75</td>
    </tr>
    <tr>
      <td>CHN</td>
      <td>China</td>
      <td align="right">1378.66</td>
      <td align="right">11199.15</td>
      <td align="right">76.25</td>
      <td align="right">1364.93</td>
    </tr>
    <tr>
      <td>IND</td>
      <td>India</td>
      <td align="right">1324.17</td>
      <td align="right">2263.79</td>
      <td align="right">68.56</td>
      <td align="right">1127.81</td>
    </tr>
    <tr>
      <td>RUS</td>
      <td>Russia</td>
      <td align="right">144.34</td>
      <td align="right">1283.16</td>
      <td align="right">71.59</td>
      <td align="right">229.13</td>
    </tr>
    <tr>
      <td>USA</td>
      <td>United States</td>
      <td align="right">323.13</td>
      <td align="right">18624.47</td>
      <td align="right">78.69</td>
      <td align="right">395.88</td>
    </tr>
    <tr>
      <td>VNM</td>
      <td>Vietnam</td>
      <td align="right">94.57</td>
      <td align="right">205.28</td>
      <td align="right">76.25</td>
      <td align="right">120.6</td>
    </tr>
  </tbody>
</table>

Get List of Column Names

In [ ]:
 

Obtaining Data: All Table Data

In [ ]:
 

Obtaining Data: Column at a Time

In [ ]:
 

Static Web Page Example: Lists

http://datasystems.denison.edu/ind0.html

In [6]:
os.getcwd()
Out[6]:
'/Users/tcbressoud/Dropbox/cs181-DataSystems/cs181-bressoud/f20_class/source/Web-Scraping'
In [23]:
!curl -s -o ind0.html http://datasystems.denison.edu/ind0.html
In [24]:
location = "datasystems.denison.edu"
resource = "/ind0.html"

url = util.buildURL(resource, location)
r2 = requests.get(url)
assert r2.status_code == 200

util.print_text(r2.text, nlines=5)
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
	<head>
		<meta charset="utf-8" />
		<meta http-equiv="X-UA-Compatible" content="IE=edge" />
In [36]: