In [7]:
import requests
In [17]:
import os
import os.path
import sys
import importlib
import pandas as pd

# Locate the shared "modules" directory: try three levels up first and
# fall back to two levels up when that directory does not exist.
module_dir = os.path.join("../../..", "modules")
if not os.path.isdir(module_dir):
    module_dir = os.path.join("../..", "modules")

# Register the absolute form on sys.path, but only once, so re-running this
# cell does not keep appending duplicate entries.
module_path = os.path.abspath(module_dir)
if module_path not in sys.path:
    sys.path.append(module_path)

import mysocket as sock
importlib.reload(sock)

import util
importlib.reload(util)

# Relative directory that holds the data files; joined with a file name
# (e.g. "tn10.csv") to build the paths opened later in this notebook.
datadir = "publicdata"
In [18]:
import io
In [19]:
# CSV text held in memory: one header line followed by ten data rows.
# Adjacent literals are concatenated by the parser; every line, including
# the last, ends with an explicit newline.
s = (
    "year,sex,name,count\n"
    "2014,Female,Emma,20936\n"
    "2014,Male,Noah,19305\n"
    "2015,Female,Emma,20455\n"
    "2015,Male,Noah,19635\n"
    "2016,Female,Emma,19496\n"
    "2016,Male,Noah,19117\n"
    "2017,Female,Emma,19800\n"
    "2017,Male,Liam,18798\n"
    "2018,Female,Emma,18688\n"
    "2018,Male,Liam,19837\n"
)
In [20]:
# print() writes the string as-is, so each embedded newline appears as a line break
print(s)
year,sex,name,count
2014,Female,Emma,20936
2014,Male,Noah,19305
2015,Female,Emma,20455
2015,Male,Noah,19635
2016,Female,Emma,19496
2016,Male,Noah,19117
2017,Female,Emma,19800
2017,Male,Liam,18798
2018,Female,Emma,18688
2018,Male,Liam,19837

In [21]:
# repr() shows the string as a quoted literal, making each newline visible as \n
print(repr(s))
'year,sex,name,count\n2014,Female,Emma,20936\n2014,Male,Noah,19305\n2015,Female,Emma,20455\n2015,Male,Noah,19635\n2016,Female,Emma,19496\n2016,Male,Noah,19117\n2017,Female,Emma,19800\n2017,Male,Liam,18798\n2018,Female,Emma,18688\n2018,Male,Liam,19837\n'

File-Based Processing

Native Data Structure

In [25]:
pathname = os.path.join(datadir, "tn10.csv")

# Parse the CSV by hand into native Python structures. The `with` block
# closes the file even if reading or splitting raises part-way through.
with open(pathname, "rt") as fileObj:
    # The first line is the header: comma-separated column names
    columns = fileObj.readline().strip().split(',')
    print(columns)

    # Each remaining line becomes one inner list of string fields
    LoL = []
    for row in fileObj:
        fields = row.strip().split(',')
        LoL.append(fields)

print(LoL)

# Fields are passed through exactly as read (all strings); no type
# conversion is applied before building the frame.
df = pd.DataFrame(LoL, columns=columns)
df
['year', 'sex', 'name', 'count']
[['2014', 'Female', 'Emma', '20936'], ['2014', 'Male', 'Noah', '19305'], ['2015', 'Female', 'Emma', '20455'], ['2015', 'Male', 'Noah', '19635'], ['2016', 'Female', 'Emma', '19496'], ['2016', 'Male', 'Noah', '19117'], ['2017', 'Female', 'Emma', '19800'], ['2017', 'Male', 'Liam', '18798'], ['2018', 'Female', 'Emma', '18688'], ['2018', 'Male', 'Liam', '19837']]
Out[25]:
year sex name count
0 2014 Female Emma 20936
1 2014 Male Noah 19305
2 2015 Female Emma 20455
3 2015 Male Noah 19635
4 2016 Female Emma 19496
5 2016 Male Noah 19117
6 2017 Female Emma 19800
7 2017 Male Liam 18798
8 2018 Female Emma 18688
9 2018 Male Liam 19837

Pandas read_csv

In [26]:
# read_csv called with the path string
df2 = pd.read_csv(pathname)

# read_csv called with an already-open text file object instead of a path
with open(pathname, 'rt') as fileObj:
    df3 = pd.read_csv(fileObj)

# Display the frame that was built from the file object
df3
Out[26]:
year sex name count
0 2014 Female Emma 20936
1 2014 Male Noah 19305
2 2015 Female Emma 20455
3 2015 Male Noah 19635
4 2016 Female Emma 19496
5 2016 Male Noah 19117
6 2017 Female Emma 19800
7 2017 Male Liam 18798
8 2018 Female Emma 18688
9 2018 Male Liam 19837
In [28]:
# Left disabled on purpose: read_csv treats a str argument as a path/URL,
# not as CSV content, so passing the CSV text `s` directly does not parse it.
# df4 = pd.read_csv(s)

Goal: Given data held in a string, how do we create a two-dimensional structure or a data frame?

What to do with a string

Request

In [30]:
# Build the URL for the remote CSV resource and fetch it over HTTP.
csvurl = util.buildURL("/data/ind2016_16.csv", "datasystems.denison.edu")
# Without a timeout, requests waits indefinitely on an unresponsive server,
# which would hang the kernel; 10 seconds is ample for a file this small.
response = requests.get(csvurl, timeout=10)
# Anything other than 200 OK is reported but not raised, so later cells
# still run and can inspect `response`.
if response.status_code != 200:
    print("Error acquiring file")
In [34]:
# Show the body as text decoded by requests. In the output every character
# is preceded by \x00, so the encoding used for decoding does not match the file.
response.text
Out[34]:
'\x00c\x00o\x00d\x00e\x00,\x00c\x00o\x00u\x00n\x00t\x00r\x00y\x00,\x00p\x00o\x00p\x00,\x00g\x00d\x00p\x00,\x00l\x00i\x00f\x00e\x00,\x00c\x00e\x00l\x00l\x00\n\x00C\x00A\x00N\x00,\x00C\x00a\x00n\x00a\x00d\x00a\x00,\x003\x006\x00.\x002\x006\x00,\x001\x005\x003\x005\x00.\x007\x007\x00,\x008\x002\x00.\x003\x00,\x003\x000\x00.\x007\x005\x00\n\x00C\x00H\x00N\x00,\x00C\x00h\x00i\x00n\x00a\x00,\x001\x003\x007\x008\x00.\x006\x006\x00,\x001\x001\x001\x009\x009\x00.\x001\x005\x00,\x007\x006\x00.\x002\x005\x00,\x001\x003\x006\x004\x00.\x009\x003\x00\n\x00I\x00N\x00D\x00,\x00I\x00n\x00d\x00i\x00a\x00,\x001\x003\x002\x004\x00.\x001\x007\x00,\x002\x002\x006\x003\x00.\x007\x009\x00,\x006\x008\x00.\x005\x006\x00,\x001\x001\x002\x007\x00.\x008\x001\x00\n\x00R\x00U\x00S\x00,\x00R\x00u\x00s\x00s\x00i\x00a\x00,\x001\x004\x004\x00.\x003\x004\x00,\x001\x002\x008\x003\x00.\x001\x006\x00,\x007\x001\x00.\x005\x009\x00,\x002\x002\x009\x00.\x001\x003\x00\n\x00U\x00S\x00A\x00,\x00U\x00n\x00i\x00t\x00e\x00d\x00 \x00S\x00t\x00a\x00t\x00e\x00s\x00,\x003\x002\x003\x00.\x001\x003\x00,\x001\x008\x006\x002\x004\x00.\x004\x007\x00,\x007\x008\x00.\x006\x009\x00,\x003\x009\x005\x00.\x008\x008\x00\n\x00V\x00N\x00M\x00,\x00V\x00i\x00e\x00t\x00n\x00a\x00m\x00,\x009\x004\x00.\x005\x007\x00,\x002\x000\x005\x00.\x002\x008\x00,\x007\x006\x00.\x002\x005\x00,\x001\x002\x000\x00.\x006\x00\n'

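How to make it "right": every character above is preceded by \x00 because the body is UTF-16 (big-endian) text that was decoded with the wrong encoding. Decoding the raw bytes explicitly, e.g. response.content.decode("utf-16-be"), yields the string below.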

In [35]:
 
Out[35]:
'code,country,pop,gdp,life,cell\nCAN,Canada,36.26,1535.77,82.3,30.75\nCHN,China,1378.66,11199.15,76.25,1364.93\nIND,India,1324.17,2263.79,68.56,1127.81\nRUS,Russia,144.34,1283.16,71.59,229.13\nUSA,United States,323.13,18624.47,78.69,395.88\nVNM,Vietnam,94.57,205.28,76.25,120.6\n'
In [ ]: