{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Web Scraping In Class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-b9abcf27cf7faf8f",
     "locked": true,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       ""
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import io\n",
    "import sys\n",
    "import importlib\n",
    "import pandas as pd\n",
    "from lxml import etree\n",
    "import requests\n",
    "\n",
    "# Locate the shared \"modules\" directory: three levels up if present, else two.\n",
    "module_dir = os.path.join(\"../../..\", \"modules\")\n",
    "if not os.path.isdir(module_dir):\n",
    "    module_dir = os.path.join(\"../..\", \"modules\")\n",
    "\n",
    "# Put its absolute path on sys.path once so `util` can be imported.\n",
    "module_path = os.path.abspath(module_dir)\n",
    "if module_path not in sys.path:\n",
    "    sys.path.append(module_path)\n",
    "\n",
    "import util\n",
    "importlib.reload(util)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## HTML Structure"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n", "\n", " \n", " Data Systems Basic HTML Page\n", " \n", " \n", "

First Level Heading

\n", "\n", "

Paragraph defined in body.\n", "\n", "

Second Level Heading

\n", "\n", " Link to Python documentation.\n", "

\n", "\n", " \n", " \n", "\n", "\n" ] } ],
   "source": [
    "location = \"datasystems.denison.edu\"\n",
    "resource = \"/basic.html\"\n",
    "\n",
    "url = util.buildURL(resource, location)\n",
    "# Bound the wait so an unreachable server cannot hang the notebook.\n",
    "r = requests.get(url, timeout=10)\n",
    "assert r.status_code == 200\n",
    "\n",
    "print(r.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bad_html = \"test<body><h1>header title</h3>\"\n",
    "xmlparser = etree.XMLParser()\n",
    "\n",
    "# The strict XML parser refuses malformed markup. Catch the error and print it\n",
    "# so the demonstration is visible without aborting Restart & Run All.\n",
    "try:\n",
    "    tree = etree.parse(io.StringIO(bad_html), parser=xmlparser)\n",
    "except etree.XMLSyntaxError as err:\n",
    "    print(\"XMLSyntaxError:\", err)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Well-formed counterpart of bad_html: every opening tag has a matching close.\n",
    "html2 = \"<html><head><title>test</title></head><body><h1>header title</h1></body></html>\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Well-formed input, so the strict XML parser accepts it.\n",
    "tree = etree.parse(io.StringIO(html2), parser=xmlparser)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the malformed string with lxml's HTML parser instead of the XML parser.\n",
    "htmlparser = etree.HTMLParser()\n",
    "tree = etree.parse(io.StringIO(bad_html), parser=htmlparser)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Static Web Page Example: Table\n",
    "\n",
    "http://datasystems.denison.edu/ind2016.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -f: fail on HTTP errors instead of saving the error page; -S: still report errors despite -s.\n",
    "!curl -sSf -o ind2016.html http://datasystems.denison.edu/ind2016.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n", "\n", "\t\n", "\t\t\n", "\t\t\n"
     ]
    }
   ],
   "source": [
    "location = \"datasystems.denison.edu\"\n",
    "resource = \"/ind2016.html\"\n",
    "\n",
    "url = util.buildURL(resource, location)\n",
    "# Bound the wait so an unreachable server cannot hang the notebook.\n",
    "r = requests.get(url, timeout=10)\n",
    "assert r.status_code == 200\n",
    "\n",
    "util.print_text(r.text, nlines=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the root element directly from the response bytes.\n",
    "htmlparser = etree.HTMLParser()\n",
    "indroot = etree.fromstring(r.content, parser=htmlparser)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative: parse the same bytes through a file-like object.\n",
    "indtree = etree.parse(io.BytesIO(r.content), parser=htmlparser)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       ""
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "indroot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n", " \n", " \n", " ind2016 | Introduction to Data Systems\n", " \n", " \n", " \n", " \n",
      "\t\tWebFontConfig={google:{families:[\"Lobster\",]},timeout:3e3\n", "\t\t\n",
      " \n", " \n", " \n", " \n",
      " code\n", " country\n", " pop\n", " gdp\n", " life\n", " cell\n", " \n", " \n", " \n", " \n",
      " CAN\n", " Canada\n", " 36.26\n", " 1535.77\n", " 82.3\n", " 30.75\n", " \n", " \n",
      " CHN\n", " China\n", " 1378.66\n", " 11199.15\n", " 76.25\n", " 1364.93\n", " \n", " \n",
      " IND\n", " India\n", " 1324.17\n", " 2263.79\n", " 68.56\n", " 1127.81\n", " \n", " \n",
      " RUS\n", " Russia\n", " 144.34\n", " 1283.16\n", " 71.59\n", " 229.13\n", " \n", " \n",
      " USA\n", " United States\n", " 323.13\n", " 18624.47\n", " 78.69\n", " 395.88\n", " \n", " \n",
      " VNM\n", " Vietnam\n", " 94.57\n", " 205.28\n", " 76.25\n", " 120.6\n", " \n", " \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Collect the <table> elements from the parsed page here, so this cell does\n",
    "# not depend on a variable left over from an earlier kernel session.\n",
    "tables = indroot.xpath(\"//table\")\n",
    "util.print_xml(tables[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get List of Column Names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Obtaining Data: All Table Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Obtaining Data: Column at a Time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Static Web Page Example: Lists\n",
    "\n",
    "http://datasystems.denison.edu/ind0.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -f: fail on HTTP errors instead of saving the error page; -S: still report errors despite -s.\n",
    "!curl -sSf -o ind0.html http://datasystems.denison.edu/ind0.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n", "\n", "\t\n", "\t\t\n", "\t\t\n"
     ]
    }
   ],
   "source": [
    "location = \"datasystems.denison.edu\"\n",
    "resource = \"/ind0.html\"\n",
    "\n",
    "url = util.buildURL(resource, location)\n",
    "# Bound the wait so an unreachable server cannot hang the notebook.\n",
    "r2 = requests.get(url, timeout=10)\n",
    "assert r2.status_code == 200\n",
    "\n",
    "util.print_text(r2.text, nlines=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}