Before you turn this problem in, make sure everything runs as expected. This is a combination of restarting the kernel and then running all cells (in the menubar, select Kernel$\rightarrow$Restart And Run All).
Make sure you fill in any place that says YOUR CODE HERE
or "YOUR ANSWER HERE".
import os
import os.path
import pandas as pd
# Relative directory that is joined with the CSV file names
# (e.g. "tn10.csv", "topnames.csv") when the cells below read their data.
datadir = "publicdata"
Q1 Assuming that path
refers to a CSV file containing $x$ rows of data, with one header line of year,sex,name,count
and data lines with those same four fields, write a function
readTopNamesDoL(path)
that reads the file and creates a DoL representation and returns that dictionary from the function.
def readTopNamesDoL(path):
    """Read a topnames CSV file into a dictionary-of-lists (DoL).

    The file is expected to hold one header line (year,sex,name,count)
    followed by data lines with those same four fields.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A dict with the keys 'year', 'sex', 'name' and 'count'. Each key
        maps to a list with that column's values in file order. The 'year'
        and 'count' values are converted to int; 'sex' and 'name' remain
        strings.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a data line does not have exactly four fields, or
            its year or count is not an integer.
    """
    import csv  # local import: csv is not among the file's top-level imports

    column_names = ("year", "sex", "name", "count")
    DoL = {column: [] for column in column_names}

    with open(path, newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header line; tolerate an empty file
        for row in reader:
            if not row:
                continue  # csv.reader yields [] for blank lines
            year, sex, name, count = (field.strip() for field in row)
            DoL["year"].append(int(year))
            DoL["sex"].append(sex)
            DoL["name"].append(name)
            DoL["count"].append(int(count))

    return DoL
# Read the ten-row sample file and display the resulting DoL.
tn10 = readTopNamesDoL(os.path.join(datadir, "tn10.csv"))
print(tn10)

# Read the file again and check the shape of the returned structure:
# a dict with four columns, each holding ten values.
tn10 = readTopNamesDoL(os.path.join(datadir, "tn10.csv"))
assert isinstance(tn10, dict)
assert len(tn10) == 4
assert all(column in tn10 for column in ("year", "sex", "count"))
assert len(tn10["year"]) == 10
# hidden tests here
assert True
Q2 Write a function
filterTopNamesDoL(tnDoL, threshold)
to create a filtered copy of a topnames DoL tnDoL
(with columns year
, sex
, name
, count
) so that only rows with a count value greater than or equal to threshold
are present in the newly created DoL. Your function should return the new and filtered DoL.
def filterTopNamesDoL(tnDoL, threshold):
    """Return a filtered copy of a topnames dictionary-of-lists.

    Only the rows whose 'count' value is greater than or equal to
    ``threshold`` are kept. ``tnDoL`` itself is left unmodified.

    Args:
        tnDoL: Dict mapping column names to equal-length lists; it must
            contain a 'count' column.
        threshold: Minimum count (inclusive) a row needs to be kept.

    Returns:
        A new dict with the same column names as ``tnDoL``, whose lists
        hold only the values of the rows that passed the filter, in their
        original order.

    Raises:
        KeyError: If ``tnDoL`` has no 'count' column.
    """
    # Work out once which row positions survive, then slice every column
    # with the same positions so the columns stay aligned.
    kept_rows = [
        index
        for index, count in enumerate(tnDoL["count"])
        if count >= threshold
    ]
    return {
        column: [values[index] for index in kept_rows]
        for column, values in tnDoL.items()
    }
# Read both data files, then filter the ten-row sample and display the result.
tn10 = readTopNamesDoL(os.path.join(datadir, "tn10.csv"))
tn = readTopNamesDoL(os.path.join(datadir, "topnames.csv"))
tn10_filter = filterTopNamesDoL(tn10, 19000)
print(tn10_filter)

# Literal six-row DoL; four of its counts are at or above 19000.
topnames = dict(
    year=[2018, 2018, 2017, 2017, 2016, 2016],
    sex=["Male", "Female", "Male", "Female", "Male", "Female"],
    name=["Liam", "Emma", "Liam", "Emma", "Noah", "Emma"],
    count=[19837, 18688, 18798, 19800, 19117, 19496],
)
filtered = filterTopNamesDoL(topnames, 19000)
# Every column of the filtered DoL must hold exactly the four kept rows.
assert all(
    len(filtered[column]) == 4 for column in ("count", "year", "name", "sex")
)
# Tests are hidden to allow def of correct read function
Q3 Write a function
addCatColumnDoL(tnDoL, threshold1, threshold2)
that adds a categorical column to a DoL representation in parameter tnDoL
with the new column named category
whose values are the strings "small"
when count is below threshold1
, is "medium"
when count is greater than or equal to threshold1
and less than threshold2
, and "large"
when count is greater than or equal to threshold2
. This change to tnDoL
happens in place, rather than creating a new dictionary, and so nothing is returned from the function.
def addCatColumnDoL(tnDoL, threshold1, threshold2):
    """Add a 'category' column to a topnames dictionary-of-lists, in place.

    Each row is labelled from its 'count' value:

    * "small"  when count < threshold1
    * "medium" when threshold1 <= count < threshold2
    * "large"  when count >= threshold2

    Args:
        tnDoL: Dict mapping column names to equal-length lists; it must
            contain a 'count' column. It is modified in place.
        threshold1: Lower boundary (inclusive) of the "medium" category.
        threshold2: Lower boundary (inclusive) of the "large" category.

    Returns:
        None. The new column is stored under ``tnDoL['category']``.

    Raises:
        KeyError: If ``tnDoL`` has no 'count' column.
    """
    def _categorize(count):
        # Test the smallest range first so "small" wins whenever the count
        # is below threshold1, exactly as the specification is worded.
        if count < threshold1:
            return "small"
        if count < threshold2:
            return "medium"
        return "large"

    tnDoL["category"] = [_categorize(count) for count in tnDoL["count"]]
# Categorise a literal six-row DoL and display it with the new column.
topnames = dict(
    year=[2018, 2018, 2017, 2017, 2016, 2016],
    sex=["Male", "Female", "Male", "Female", "Male", "Female"],
    name=["Liam", "Emma", "Liam", "Emma", "Noah", "Emma"],
    count=[19837, 18688, 18798, 19800, 19117, 19496],
)
addCatColumnDoL(topnames, 19000, 19500)
print(topnames)

# Rebuild the same DoL and check the added column.
topnames = dict(
    year=[2018, 2018, 2017, 2017, 2016, 2016],
    sex=["Male", "Female", "Male", "Female", "Male", "Female"],
    name=["Liam", "Emma", "Liam", "Emma", "Noah", "Emma"],
    count=[19837, 18688, 18798, 19800, 19117, 19496],
)
addCatColumnDoL(topnames, 19000, 19500)
assert "category" in topnames
# With these thresholds all three labels must occur at least once.
assert all(
    label in topnames["category"] for label in ("small", "medium", "large")
)
assert len(topnames["category"]) == len(topnames["year"])
Q4 Write a function
dropColumnDoL(DoL, columnname)
that drops the column specified by columnname
from the dictionary of lists representation given in DoL
. This should be done "in place". If columnname
does not refer to one of the columns in DoL
, the function should simply return.
def dropColumnDoL(DoL, columnname):
    """Remove one column from a dictionary-of-lists, in place.

    Args:
        DoL: Dict mapping column names to lists. It is modified in place.
        columnname: Name of the column to remove.

    Returns:
        None. If ``columnname`` is not a column of ``DoL`` the function
        simply returns and ``DoL`` is left untouched.
    """
    # pop() with a default removes the key when present and is a no-op
    # (instead of raising KeyError) when it is absent.
    DoL.pop(columnname, None)
# Dropping an existing column removes only that column.
topnames = dict(
    year=[2018, 2018, 2017, 2017, 2016, 2016],
    sex=["Male", "Female", "Male", "Female", "Male", "Female"],
    name=["Liam", "Emma", "Liam", "Emma", "Noah", "Emma"],
    count=[19837, 18688, 18798, 19800, 19117, 19496],
)
dropColumnDoL(topnames, 'sex')
assert all(column in topnames for column in ("year", "name", "count"))
assert "sex" not in topnames

# Dropping an unknown column leaves all four columns in place.
topnames = dict(
    year=[2018, 2018, 2017, 2017, 2016, 2016],
    sex=["Male", "Female", "Male", "Female", "Male", "Female"],
    name=["Liam", "Emma", "Liam", "Emma", "Noah", "Emma"],
    count=[19837, 18688, 18798, 19800, 19117, 19496],
)
dropColumnDoL(topnames, 'foo')
assert all(column in topnames for column in ("year", "name", "count", "sex"))
Q5 Assuming a CSV file in the same format as before, with one header line followed by $x$ data lines with the same four fields, write a function
readTopNamesLoL(path)
that reads the file and creates a LoL representation and returns both the list of column names and the list of lists structure from the function.
def readTopNamesLoL(path):
    """Read a topnames CSV file into a list-of-lists (LoL).

    The file is expected to hold one header line followed by data lines
    with the four fields year, sex, name and count, in that order.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A ``(columns, LoL)`` tuple. ``columns`` is the list of column names
        taken from the header line (empty if the file is empty). ``LoL`` is
        a list with one ``[year, sex, name, count]`` list per data line, in
        file order, with year and count converted to int.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a data line does not have exactly four fields, or
            its year or count is not an integer.
    """
    import csv  # local import: csv is not among the file's top-level imports

    LoL = []
    with open(path, newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        # The first line names the columns; default to [] for an empty file.
        columns = [header.strip() for header in next(reader, [])]
        for row in reader:
            if not row:
                continue  # csv.reader yields [] for blank lines
            year, sex, name, count = (field.strip() for field in row)
            LoL.append([int(year), sex, name, int(count)])

    return columns, LoL
# Read the ten-row sample file and display the column names and the rows,
# one per output line.
tn10columns, tn10data = readTopNamesLoL(os.path.join(datadir, "tn10.csv"))
print(tn10columns, tn10data, sep="\n")

# Read the file again and check the shape of the returned structures:
# ten rows of four fields each, plus the expected column names.
tn10columns, tn10data = readTopNamesLoL(os.path.join(datadir, "tn10.csv"))
assert isinstance(tn10data, list)
assert len(tn10data) == 10
assert all(column in tn10columns for column in ("year", "sex", "count"))
assert len(tn10data[0]) == 4
# hidden tests here
assert True
Q6 Write a function
filterTopNamesLoL(tnLoL, threshold)
to filter a topnames LoL tnLoL
(with columns year
, sex
, name
, count
) so that only rows with a count value greater than or equal to threshold
are present in the newly created LoL. Note that you are creating a new LoL with the filtered data, and not modifying tnLoL
in place. Your function should return the new LoL.
def filterTopNamesLoL(tnLoL, threshold):
    """Return a filtered copy of a topnames list-of-lists.

    Only the rows whose count value is greater than or equal to
    ``threshold`` are kept. ``tnLoL`` itself is left unmodified.

    Args:
        tnLoL: List of rows laid out as ``[year, sex, name, count]``.
        threshold: Minimum count (inclusive) a row needs to be kept.

    Returns:
        A new list holding a copy of every row that passed the filter, in
        the original order.

    Raises:
        IndexError: If a row has fewer than four fields.
    """
    count_index = 3  # position of 'count' in the year, sex, name, count layout

    # Copy each kept row so later edits to the result cannot leak back
    # into the caller's data.
    return [list(row) for row in tnLoL if row[count_index] >= threshold]
# Read both data files, then filter the ten-row sample and display the result.
tn10columns, tn10data = readTopNamesLoL(os.path.join(datadir, "tn10.csv"))
tncolumns, tndata = readTopNamesLoL(os.path.join(datadir, "topnames.csv"))
tn_filter = filterTopNamesLoL(tn10data, 19000)
print(tn_filter)

# Literal six-row LoL; four of its counts are at or above 19000.
topnames = [
    [2018, "Male", "Liam", 19837],
    [2018, "Female", "Emma", 18688],
    [2017, "Male", "Liam", 18798],
    [2017, "Female", "Emma", 19800],
    [2016, "Male", "Noah", 19117],
    [2016, "Female", "Emma", 19496],
]
columns = ["year", "sex", "name", "count"]
filtered = filterTopNamesLoL(topnames, 19000)
assert len(filtered) == 4
# Tests are hidden to allow def of correct read function
Q (HW) Write a function
addCatColumnLoL(tnLoL, threshold1, threshold2)
that adds a categorical column to a LoL representation in parameter tnLoL
with the new column named category
whose values are the strings "small"
when count is below threshold1
, is "medium"
when count is greater than or equal to threshold1
and less than threshold2
, and "large"
when count is greater than or equal to threshold2
. The function should perform its modifications in place.
Q (HW) Write a function
dropColumnLoL(LoL, columns, columnname)
that drops the column specified by columnname
from the list of lists representation given in LoL
. This should be done "in place" and should not assume the topnames columns, but rather use the list of columns specified in columns
to determine which column to drop.