In the Asynchronous Lecture
list
numpy
data structure.
In the Synchronous Lecture
numpy
arrays.
pandas
pandas
series and data frame object.
If you have any questions while watching the pre-recorded material, be sure to write them down and bring them up during the synchronous portion of the lecture.
The following tabs contain pre-recorded lecture materials for class this week. Please review these materials prior to the synchronous lecture.
Total time: Approx. 1 hour and 14 minutes
Download Jupyter Notebook used in the video.
Download the data used in the Notebook:
Download the aggregated version of the gapminder.csv data used in the video.
# Batteries included Functions
import csv # convert a .csv to a nested list
import os # library for managing our operating system.
# Where am I on my computer?
os.getcwd()

# Say I needed to change my working directory
# os.chdir("file/path/here")

# THIS IS WHERE MY DATA IS LOCATED ON MY MACHINE; THIS WILL NOT RUN ON YOUR
# COMPUTER. Change this path to point to where the gapminder data is. Above this
# code chunk you'll see a link to download the data.
loc_file = "lectures/week_05/supplementary-materials/gapminder.csv"

# Read in the gapminder data as a nested list: one inner list per row of the
# .csv, with every value stored as a string. The csv module asks for files to
# be opened with newline="" so line breaks inside quoted fields parse correctly.
with open(loc_file, mode="rt", newline="") as file:
    data = [row for row in csv.reader(file)]
# %% -----------------------------------------
# Indexing Rows

# Row 0 holds the column names; every row > 0 is an observation.
data[100]

# %% -----------------------------------------
# Indexing Columns

# Referencing a column data value
d = data[100] # First select the row
d[1] # Then reference the column

# doing the above all in one step
data[100][1]

# The key is to keep in mind the column names
cnames = data.pop(0) # Removes the header row from data, so every remaining row shifts up by one.

# We can now reference this column name list to pull out the columns we're interested in.
ind = cnames.index("lifeExp") # Index allows us to "look up" the location of a data value.
data[99][ind] # Same observation that was data[100] before the header row was popped.
# %% -----------------------------------------
# Drawing out specific COLUMN of data

# identify the position
ind = cnames.index("lifeExp")

# Looping through each row pulling out the relevant data value
life_exp = []
for row in data:
    life_exp.append(float(row[ind]))

# Same idea, but as a list comprehension
life_exp = [float(row[ind]) for row in data]

# Make this code more flexible
var_name = "gdpPercap"
var_ind = cnames.index(var_name) # Look the position up once rather than once per row.
out = [row[var_ind] for row in data]
out
# %% -----------------------------------------
# Numpy offers an efficiency boost, especially when indexing
import numpy as np

# Convert the nested list (header row already removed) to a 2-D numpy array
data_np = np.array(data)

# Column Variable we wish to access is easy using slicing.
data_np[:,2]
# Let's compare runtimes!
# %% -----------------------------------------
%%timeit
= []
out1 for row in data:
out1.append(row[var_ind])
# %% -----------------------------------------
%%timeit
= [row[var_ind] for row in data]
out2
# %% -----------------------------------------
%%timeit
= data_np[:,var_ind] out3
numpy
import numpy as np
#### Vectors, Matrices, and N-Dimensional Arrays ####

# %% vectors (1 Dimension) -----------------------------------------
v = np.array([1,2,3,4])
v

# %% Matrix (2 Dimensions) -----------------------------------------
NL = [[1,2,3,4],[2,3,4,1],[-1,1,2,1]]
M = np.array(NL)
M
M.shape

# %% N-dimensional Array -----------------------------------------

# An N-dimensional array is built from a nested list. Here two 3x4 matrices
# are stacked inside one outer list, which gives an array of shape (2, 3, 4).
A = np.array([
    [[1,2,3,4],
     [2,3,4,1],
     [-1,1,2,1]],
    [[1,2,3,4],
     [2,3,4,1],
     [-1,1,2,1]],
])
A
A.shape
# %% -----------------------------------------
###### Generating Arrays #####

# .arange
np.arange(1, 10, 1)

# .linspace
np.linspace(1,5,10)

# Zeros
np.zeros(10)

# Ones
np.ones(10)

# Random number generation
np.random.randn(10) # Random Number
np.random.randint(1,10,10) # Random Integer
np.random.uniform(1,5,10) # Uniform distribution
np.random.binomial(1,.5,10) # Binomial (Trials)
np.random.normal(5,1,10) # Normal
np.random.poisson(5,5) # Poisson
# %% Indexing -----------------------------------------
M
M.shape

# [ROW, COLUMN]
# ":" == "all back"

# A cell
M[0,0]

# A row
M[1,:]

# A column
M[:,1]

# Slicing the data structure works as it did with other python data types
M[0:2,0:2]

# Last Column
M[:,-1]

# Last Row
M[-1, : ]

# Change the order by the requested positions
M[[2,0,1],:]
# %% Indexing Using Conditionals -----------------------------------------
X = np.random.randn(10)
X

# Vector of Boolean values
X > 0

# Can index using this vector
X[X>0]

# Logic extends to any N-dimensional Array
X = np.random.randn(50).reshape(10,5)
X[X> 0]
# %% Reshaping -----------------------------------------

# Call the shape of an array
v = np.random.randint(1,100,30)
v
v.shape

# Reshape
v.reshape(10,3)

# Reshape has to be plausible: the 30 values in v cannot fill a 10x2 array, so
# numpy raises a ValueError. It is caught and printed here so that the cells
# below still run when the file is executed top to bottom.
try:
    v.reshape(10,2)
except ValueError as err:
    print(err)
# %% Reassignment -----------------------------------------
X = np.zeros(50).reshape(10,5)
X

# Reassign data values by referencing positions
X[0,0] = 999
X

# Reassign whole ranges of values
X[0,:] = 999
X

X[:,0] = 999
X

# Reassignment using boolean values.
D = np.random.randn(50).reshape(10,5).round(1)
D

D > 0

D[D > 0] = 1
D[D <= 0] = 0
D

# Using where "ifelse()-like" method
D = np.random.randn(50).reshape(10,5).round(1) # Generate some random numbers again
D # Before
np.where(D>0,1,0) # After
These exercises are designed to help you reinforce your grasp of the concepts covered in the asynchronous lecture material.
Convert the following loop into a list comprehension.
# Pair every number 0-9 with every letter of "georgetown" except "g".
bind = []
for i in range(10):
    for j in "georgetown":
        if j != "g":
            bind.append((i,j))
print(bind)
# Same result as the nested loop: the for clauses read left to right in the
# order the loops were nested, with the filter last.
bind = [(i,j) for i in range(10) for j in "georgetown" if j != "g"]
print(bind)
Save the following lines of text to your Desktop as a .txt file named zen_of_python.txt.
The Zen of Python, by Tim Peters
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
# Store the text as a string
txt = """
The Zen of Python, by Tim Peters
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
"""

# Define the relevant file path to your Desktop (include the trailing slash,
# since the file name is appended directly to it). An empty string writes the
# file to the current working directory.
file_path = ""

# Open connection, write the text, then close. txt is a single string, so
# .write() is the right call; .writelines() expects an iterable of lines and
# would loop over the string one character at a time.
with open(file_path + "zen_of_python.txt",mode="wt",encoding="utf-8") as file:
    file.write(txt)
Using the following data, write a function called select() that takes the nested list data and a variable name as input and returns the requested variable as a single list. Make sure the function can deal with cases when a variable that is not in the data is requested (e.g. the variable name is misspelled). Make sure you include a docstring with your function.
# Nested list: the first row holds the column names, each later row is one observation.
data = [
    ["Var1","Var2","Var3"],
    [1,"Apples",True],
    [4,"Horses",None],
    [-1,"Small Birds",False],
]
def select(data,variable):
    """Select a column variable by name from data organized as a nested list.

    The first row of ``data`` is treated as the header holding the column
    names and every remaining row as an observation. ``data`` is only read,
    never modified, so the function can be called repeatedly on the same
    nested list.

    Args:
        data (list): data structure organized as a nested list.
        variable (str): Name of the variable being selected.

    Returns:
        list: list containing the requested data column. None is returned
            when ``variable`` is not one of the column names (e.g. the name
            is misspelled) or when ``data`` is empty.
    """
    if not data:
        return None
    # Slice instead of data.pop(0): popping would strip the header from the
    # caller's list, so a second call would use a data row as the header.
    cnames, rows = data[0], data[1:]
    if variable not in cnames:
        return None
    position = cnames.index(variable)  # look the position up once, not once per row
    return [row[position] for row in rows]
# Test: request the "Var2" column by name and print the resulting list
print(select(data, variable="Var2"))
## ['Apples', 'Horses', 'Small Birds']
The following materials were generated for students enrolled in PPOL564. Please do not distribute without permission.
ed769@georgetown.edu | www.ericdunford.com