Web Scraping In Class

import os
import io
import sys
import importlib
import pandas as pd
from lxml import etree
import requests

# Locate the project's "modules" directory relative to the current working
# directory: prefer three levels up when that directory exists, otherwise
# fall back (without checking) to two levels up.
module_dir = os.path.join("../../..", "modules")
if not os.path.isdir(module_dir):
    module_dir = os.path.join("../..", "modules")

# Register the absolute form of the path, and only once, so re-running this
# cell does not keep appending duplicate entries to sys.path.
module_path = os.path.abspath(module_dir)
if module_path not in sys.path:
    sys.path.append(module_path)

import util
importlib.reload(util)

<module 'util' from '/Users/tcbressoud/Dropbox/cs181-DataSystems/cs181-bressoud/f20_class/modules/util.py'>

HTML Structure

# Host and resource path of the small example page.
location = "datasystems.denison.edu"
resource = "/basic.html"

# Build the URL with the util helper and fetch the page.  The timeout keeps
# the cell from blocking forever if the server never answers (requests has
# no default timeout).
url = util.buildURL(resource, location)
r = requests.get(url, timeout=10)
# Stop here with a readable message if the request did not succeed.
assert r.status_code == 200, f"GET {url} returned status {r.status_code}"

# Show the response body as text.
print(r.text)

<!DOCTYPE html>
<html lang="en">
  <head>
    <title>Data Systems Basic HTML Page</title>
  </head>
  <body>
    <h1>First Level Heading</h1>

    <p>Paragraph defined in <b>body</b>.

    <h2>Second Level Heading</h2>

    <a href="http://docs.python.org">Link</a> to Python documentation.
    </p>

    <ul>
      <li>Item 1
      <ol>
        <li>Item 1 nested</li>
        <li>Item 2 nested</li>
      </ol>
      </li>
      <li>Item 2</li>
      <li>Item 3</li>
    </ul>
  </body>
</html>

# Deliberately malformed markup: <html>, <head>, <title> and <body> are never
# closed, and the <h1> element is terminated by a mismatched </h3>.
bad_html = "<html><head><title>test<body><h1>header title</h3>"
xmlparser  = etree.XMLParser()
# Left commented out on purpose: with the XML parser this call raises
# XMLSyntaxError ("Opening and ending tag mismatch: h1 ... and h3").
#tree = etree.parse(io.StringIO(bad_html), parser=xmlparser)

Traceback (most recent call last):

  File "/Users/tcbressoud/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)

  File "<ipython-input-18-952062f08bb2>", line 3, in <module>
    tree = etree.parse(io.StringIO(bad_html), parser=xmlparser)

  File "src/lxml/etree.pyx", line 3521, in lxml.etree.parse

  File "src/lxml/parser.pxi", line 1876, in lxml.etree._parseDocument

  File "src/lxml/parser.pxi", line 1896, in lxml.etree._parseMemoryDocument

  File "src/lxml/parser.pxi", line 1777, in lxml.etree._parseDoc

  File "src/lxml/parser.pxi", line 1082, in lxml.etree._BaseParser._parseUnicodeDoc

  File "src/lxml/parser.pxi", line 615, in lxml.etree._ParserContext._handleParseResultDoc

  File "src/lxml/parser.pxi", line 725, in lxml.etree._handleParseResult

  File "src/lxml/parser.pxi", line 654, in lxml.etree._raiseParseError

  File "<string>", line 1
XMLSyntaxError: Opening and ending tag mismatch: h1 line 1 and h3, line 1, column 52

# Well-formed counterpart of bad_html: every element is closed and matched.
html2 = "<html><head><title>test</title></head><body><h1>header title</h1></body></html>"

# Parse the well-formed string with the XML parser (xmlparser) created earlier.
tree = etree.parse(io.StringIO(html2), parser=xmlparser)

# Now parse the malformed bad_html string with lxml's HTML parser instead of
# the XML parser; `tree` is rebound to the result.
# NOTE(review): the HTML parser is expected to recover from the broken
# markup -- the notebook shows no error output for this cell; confirm.
htmlparser =  etree.HTMLParser()
tree = etree.parse(io.StringIO(bad_html), parser=htmlparser)

Static Web Page Example: Table

http://datasystems.denison.edu/ind2016.html

# Show the current working directory; the curl command that follows saves
# ind2016.html using a relative file name.
os.getcwd()

'/Users/tcbressoud/Dropbox/cs181-DataSystems/cs181-bressoud/f20_class/source/Web-Scraping'

!curl -s -o ind2016.html http://datasystems.denison.edu/ind2016.html

# Host and resource path of the page containing the indicators table.
location = "datasystems.denison.edu"
resource = "/ind2016.html"

# Build the URL with the util helper and fetch the page.  The timeout keeps
# the cell from blocking forever if the server never answers (requests has
# no default timeout).
url = util.buildURL(resource, location)
r = requests.get(url, timeout=10)
# Stop here with a readable message if the request did not succeed.
assert r.status_code == 200, f"GET {url} returned status {r.status_code}"

# Preview the start of the response text (nlines=5; five lines are shown in
# the captured output).
util.print_text(r.text, nlines=5)

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
	<head>
		<meta charset="utf-8" />
		<meta http-equiv="X-UA-Compatible" content="IE=edge" />

# Parse the raw response bytes (r.content) with lxml's HTML parser.
htmlparser = etree.HTMLParser()
# fromstring() is given the bytes directly; the result is displayed later in
# the notebook as an <Element html ...>, i.e. the document's root element.
indroot = etree.fromstring(r.content, parser=htmlparser)

# parse() takes a file-like object, so the same bytes are wrapped in BytesIO.
# NOTE(review): per the lxml docs this returns an ElementTree rather than the
# root Element -- confirm how indtree is used later.
indtree = etree.parse(io.BytesIO(r.content), parser=htmlparser)

# Display the repr of the parsed root element.
indroot

<Element html at 0x118ed2e60>

# Print the beginning of the parsed tree via the util helper.
# NOTE(review): nlines presumably caps the number of printed lines (20 lines
# appear in the captured output, each cut off at a fixed width) -- confirm
# against util.print_xml.
util.print_xml(indroot, nlines=20)

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" la
  <head>
    <meta charset="utf-8"/>
    <meta http-equiv="X-UA-Compatible" content="IE=edge"/>
    <title>ind2016 | Introduction to Data Systems</title>
    <meta name="robots" content="all"/>
    <meta name="generator" content="Sandvox 2.10.12"/>
    <meta name="viewport" content="width=772"/>
    <link rel="shortcut icon" type="image/x-icon" href="fav
    <link rel="canonical" href="http://datasystems.denison.
    <script>
		WebFontConfig={google:{families:["Lobster",]},timeout:3e3
		</script>
    <link rel="stylesheet" type="text/css" href="sandvox_Ap
    <!--
		Photo credits for this website's design: <http://datasyst
		Licensing for this website's design:     <http://datasyst
		-->
  </head>
  <body class="sandvox has-page-title allow-sidebar no-cust

# XPath "//table" selects every <table> element anywhere below the root;
# xpath() returns the matches as a list.
tables = indroot.xpath("//table")

# How many tables the page contains.
len(tables)

1

# Print the first (index 0) matched <table> subtree via the util helper; no
# nlines argument is passed here.
util.print_xml(tables[0])

<table class="table table-bordered table-hover table-conden
  <thead>
    <tr>
      <th title="Field #1">code</th>
      <th title="Field #2">country</th>
      <th title="Field #3">pop</th>
      <th title="Field #4">gdp</th>
      <th title="Field #5">life</th>
      <th title="Field #6">cell</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>CAN</td>
      <td>Canada</td>
      <td align="right">36.26</td>
      <td align="right">1535.77</td>
      <td align="right">82.3</td>
      <td align="right">30.75</td>
    </tr>
    <tr>
      <td>CHN</td>
      <td>China</td>
      <td align="right">1378.66</td>
      <td align="right">11199.15</td>
      <td align="right">76.25</td>
      <td align="right">1364.93</td>
    </tr>
    <tr>
      <td>IND</td>
      <td>India</td>
      <td align="right">1324.17</td>
      <td align="right">2263.79</td>
      <td align="right">68.56</td>
      <td align="right">1127.81</td>
    </tr>
    <tr>
      <td>RUS</td>
      <td>Russia</td>
      <td align="right">144.34</td>
      <td align="right">1283.16</td>
      <td align="right">71.59</td>
      <td align="right">229.13</td>
    </tr>
    <tr>
      <td>USA</td>
      <td>United States</td>
      <td align="right">323.13</td>
      <td align="right">18624.47</td>
      <td align="right">78.69</td>
      <td align="right">395.88</td>
    </tr>
    <tr>
      <td>VNM</td>
      <td>Vietnam</td>
      <td align="right">94.57</td>
      <td align="right">205.28</td>
      <td align="right">76.25</td>
      <td align="right">120.6</td>
    </tr>
  </tbody>
</table>

Get List of Column Names

Obtaining Data: All Table Data

Obtaining Data: Column at a Time

Static Web Page Example: Lists

http://datasystems.denison.edu/ind0.html

# Show the current working directory; the curl command that follows saves
# ind0.html using a relative file name.
os.getcwd()

'/Users/tcbressoud/Dropbox/cs181-DataSystems/cs181-bressoud/f20_class/source/Web-Scraping'

!curl -s -o ind0.html http://datasystems.denison.edu/ind0.html

# Host and resource path of the page used for the list-scraping example.
location = "datasystems.denison.edu"
resource = "/ind0.html"

# Build the URL with the util helper and fetch the page into r2.  The timeout
# keeps the cell from blocking forever if the server never answers (requests
# has no default timeout).
url = util.buildURL(resource, location)
r2 = requests.get(url, timeout=10)
# Stop here with a readable message if the request did not succeed.
assert r2.status_code == 200, f"GET {url} returned status {r2.status_code}"

# Preview the start of the response text (nlines=5; five lines are shown in
# the captured output).
util.print_text(r2.text, nlines=5)

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
	<head>
		<meta charset="utf-8" />
		<meta http-equiv="X-UA-Compatible" content="IE=edge" />