In the Asynchronous Lecture

- SQLite
- SQLite connection in Python
- SQLite queries

If you have any questions while watching the pre-recorded material, be sure to write them down and bring them up during the synchronous portion of the lecture.

In the Synchronous Lecture

In the synchronous lecture, we'll be discussing how to write SQLite queries. I'll be using a simple SQL GUI to demonstrate different types of queries. If you wish to follow along in class, please install the DB Browser for SQLite on your machine prior to class.
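If you'd like to experiment with the SQLite material from Python before class, here is a minimal sketch using the standard-library sqlite3 module. The database file, table, and values below are placeholders of my own, not ones used in the lecture.

import sqlite3 # Standard-library SQLite driver

# Connect to (or create) a local database file -- the file name is a placeholder
conn = sqlite3.connect("example.db")
cur = conn.cursor()

# Create a small table and insert a row (illustrative schema only)
cur.execute("CREATE TABLE IF NOT EXISTS stories (headline TEXT, date TEXT, content TEXT)")
cur.execute("INSERT INTO stories VALUES (?, ?, ?)", ("Example headline", "2020-09-22", "Body text"))
conn.commit()

# Run a simple query and fetch the results
cur.execute("SELECT headline, date FROM stories")
print(cur.fetchall())

conn.close()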
The following tabs contain pre-recorded lecture materials for class this week. Please review these materials prior to the synchronous lecture.
Total time: Approx. 1 hour and 7 minutes
import requests # For downloading the website
from bs4 import BeautifulSoup # For parsing the website
# BBC Url that we'll scrape.
= "https://www.bbc.com/news/world-us-canada-54238936"
url = requests.get(url)
page # 200 == Connection
page.status_code
# We've downloaded the entire website
page.content
# Parse the content
soup = BeautifulSoup(page.content, 'html.parser')
# Let's look at the raw code of the downloaded website
print(soup.prettify())
# With the above in hand, we can find all instances of a tag at once.
soup.find_all('p') # Here I'm locating all the paragraph tags

# We can then convert the tag to text
soup.find_all('p')[15].get_text()

# Using a list comprehension we can do this for each paragraph tag
content1 = [i.get_text() for i in soup.find_all('p')]
content1
# As we can see, we get things that we want and some things that we don't want.
# So let's be more specific in targeting specific content.
# Can use a css selector to target specific content
# #main-content > div.ssrcss-1ocoo3l-Wrap.e42f8511 > div > div.ssrcss-rgov1k-MainColumn.e1sbfw0p0 > article > div:nth-child(3) > div > p > b
story_content = [i.get_text() for i in soup.select("article > div > div")]

# NOTE: The trick here is to remove the :nth-child(3) from div:nth-child(3). Think of :nth-child(3) as an
# index on the div tag: div:nth-child(3) says give me the item at position 3, whereas div gives me all items at
# this location.
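# (Illustrative aside, not from the lecture: comparing the two selectors makes the
# point concrete. The exact counts depend on the page's current markup.)
soup.select("article > div > div:nth-child(3)") # only the div(s) sitting at position 3
soup.select("article > div > div")              # every div at this location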
# Join together as a single string
story_text = "\n".join(story_content)
print(story_text)
# Great! Now let's target different information like the headline and the date.
# Date CSS
#main-content > div.ssrcss-1ocoo3l-Wrap.e42f8511 > div > div.ssrcss-rgov1k-MainColumn.e1sbfw0p0 > article > header > div:nth-child(2) > dl > div > dd > span > time
= "article > header > div:nth-child(2) > dl > div > dd > span > time"
css_loc = soup.select(css_loc)[0].get_text()
story_date
# Get story head line
= soup.find_all("h1")[0].get_text()
story_headline
# Gather together
= [story_headline,story_date,story_text]
entry entry
import pandas as pd
import requests # For downloading the website
from bs4 import BeautifulSoup # For parsing the website
import time # To put the system to sleep
import random # for random numbers
# Building a scraper
# The idea here is to just wrap the above in a function.
# Input: url
# Output: relevant content
def bbc_scraper(url=None):

    # Download the webpage
    page = requests.get(url)

    # If a connection was reached
    if page.status_code == 200:

        # Parse
        soup = BeautifulSoup(page.content, 'html.parser')

        # Pull headline
        story_headline = soup.find_all("h1")[0].get_text()

        # Pull date
        story_date = soup.select("article > header > div:nth-child(2) > dl > div > dd > span > time")[0].get_text()

        # Pull story content
        story_content = [i.get_text() for i in soup.select("article > div > div")]
        story_text = " ".join(story_content)

        # Return data
        return [story_headline, story_date, story_text]

# Extract one webpage
bbc_scraper("https://www.bbc.com/news/world-us-canada-54238936")
# Now loop through urls of relevant stories
# Let's collect urls on all the relevant news stories of the day.
= ["https://www.bbc.com/news/world-us-canada-54238936",
urls "https://www.bbc.com/news/world-us-canada-54254141",
"https://www.bbc.com/news/world-us-canada-54229799",
"https://www.bbc.com/news/world-us-canada-54244515"]
#Then just loop through and collect
= []
scraped_data for url in urls:
# Scrape the content
scraped_data.append(bbc_scraper(url))
# Put the system to sleep for a random draw of time (be kind)
.5,3))
time.sleep(random.uniform(
# Look at the data object
scraped_data
# Organize as a pandas data frame
= pd.DataFrame(scraped_data,columns=["headline","date","content"])
dat "scraped_web_data.csv",index=False)
dat.to_csv(
# %% -----------------------------------------
# How to locate URLs?
main_bbc_page_url = "https://www.bbc.com/news"
main_page = requests.get(main_bbc_page_url)
main_page.status_code

main_soup = BeautifulSoup(main_page.content, 'html.parser')

tag = main_soup.find_all("a")[10]
tag.attrs.get("href")

# Extract relevant links
links = set()
for tag in main_soup.find_all("a"):
    href = tag.attrs.get("href")
    # Guard against anchor tags without an href before checking its contents
    if href is not None and "world-us-canada" in href and "https:" not in href:
        links.update(["https://www.bbc.com" + href])
links
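# (Aside, not from the lecture: urllib.parse.urljoin from the standard library is a
# more general way to turn relative links into absolute ones than string concatenation.)
from urllib.parse import urljoin
urljoin("https://www.bbc.com/news", "/news/world-us-canada-54238936")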
# %% -----------------------------------------
# Let's write the above as a single function
def link_scrape(urls=None, sleep=3):
    """Scrape multiple BBC URLs.

    Args:
        urls (list): list of valid BBC news urls.
        sleep (int): Integer value specifying how long the machine should be
            put to sleep (random uniform). Defaults to 3.

    Returns:
        DataFrame: frame containing headline, date, and content fields
    """
    scraped_data = []
    for url in urls:
        print(url) # Keep track of where we are at.
        try:
            # Scrape the content. This will break on URLs whose page structure we
            # haven't accounted for, so we use a try/except clause so the code
            # continues even when it fails on some urls.
            scraped_data.append(bbc_scraper(url))
        except:
            print("URL doesn't work with scraper")

        # Put the system to sleep for a random draw of time (be kind)
        time.sleep(random.uniform(0, sleep))

    dat = pd.DataFrame(scraped_data, columns=["headline", "date", "content"])
    return dat

dat_content = link_scrape(urls=links)
dat_content
# More advanced approaches to scraping the web.
# https://scrapy.org/
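For context, a Scrapy project wraps these same steps (request, parse, extract) in a Spider class. The sketch below is only illustrative and is not part of the lecture code; the spider name and CSS selectors are my own assumptions.

import scrapy # pip3 install scrapy

class BBCSpider(scrapy.Spider):
    # The spider name and start URL here are illustrative placeholders
    name = "bbc_us_canada"
    start_urls = ["https://www.bbc.com/news/world-us-canada-54238936"]

    def parse(self, response):
        # response.css() plays a similar role to soup.select()
        yield {
            "headline": response.css("h1::text").get(),
            "content": " ".join(response.css("article p::text").getall()),
        }

# Run from the terminal with: scrapy runspider <this_file>.py -o stories.json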
Download the documents being scraped in this video: .pdf / .docx
# !pip3 install PyPDF2
# !pip3 install python-docx
import PyPDF2 # Scraping PDFs
import docx # Scraping Word Documents
import warnings
"ignore")
warnings.filterwarnings(
# %% -----------------------------------------
#############################################
########## Scraping TEXT in a PDF ###########
#############################################
# Mercy Corp Report As example
# https://www.mercycorps.org/sites/default/files/2019-11/Motivations%20and%20Empty%20Promises_Mercy%20Corps_Full%20Report_0.pdf
pdfFileObj = open('mercy_corp.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

pdfReader.numPages
pdfReader.isEncrypted

pageObj = pdfReader.getPage(10)
print(pageObj.extractText())

pdfFileObj.close()
# %% -----------------------------------------
# Extract all the text content in the PDF
def read_pdf(file):

    with open(file, 'rb') as pdfFileObj:

        # Open the pdf
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

        # Locate the number of pages
        n_pages = pdfReader.numPages

        # Loop through the pages and store the content by
        # appending to a string
        content = ""
        for i in range(n_pages):
            content += pdfReader.getPage(i).extractText()

    return content

# Examine the content
mc_content = read_pdf("mercy_corp.pdf")
print(mc_content)
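One caveat worth flagging: newer releases of PyPDF2 (and its successor, pypdf) renamed these classes and methods, so the calls above may error on a recent install. A rough equivalent against the newer API, as a hedged sketch (the function name here is my own):

from PyPDF2 import PdfReader # Available in newer PyPDF2 / pypdf releases

def read_pdf_new(file):
    reader = PdfReader(file)
    # reader.pages replaces numPages/getPage(); extract_text() replaces extractText()
    return "".join(page.extract_text() or "" for page in reader.pages)

# mc_content = read_pdf_new("mercy_corp.pdf")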
# %% -----------------------------------------
# CAUTION! Not all PDFs are equal. Some are really difficult to parse.
# Here the spaces aren't stored as actual space characters in the PDF, so the
# extracted text runs together.
tw_content = read_pdf("thomas_wood.pdf")
print(tw_content)
# When you run into this, you'll have to think through an alternative parsing strategy. No free lunch here.
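One alternative that sometimes helps with these awkward PDFs (not covered in the video, so treat it as a suggestion rather than the course's approach) is the pdfplumber package, which rebuilds text from character positions:

# !pip3 install pdfplumber
import pdfplumber # Alternative PDF text extraction

def read_pdf_plumber(file):
    with pdfplumber.open(file) as pdf:
        # extract_text() can return None for pages with no recoverable text
        return "\n".join(page.extract_text() or "" for page in pdf.pages)

# print(read_pdf_plumber("thomas_wood.pdf"))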
# %% -----------------------------------------
################################################
########## Scraping TEXT in WORD doc ###########
################################################
doc = docx.Document("Easterly_and_Levine.docx")
dir(doc)

len(doc.paragraphs)

print(doc.paragraphs[0].text)

for i in doc.paragraphs:
    print(i.text)
# %% -----------------------------------------
# Wrap into a function
def get_word(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

print(get_word("Easterly_and_Levine.docx"))
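Note that doc.paragraphs skips anything stored in tables; python-docx exposes those separately through doc.tables. A small aside (not from the video, and this particular document may not contain any tables):

doc = docx.Document("Easterly_and_Levine.docx")
for table in doc.tables:
    for row in table.rows:
        print([cell.text for cell in row.cells]) # One list of cell text per row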
These exercises are designed to help you reinforce your grasp of the concepts covered in the asynchronous lecture material.
For the following question, let’s use this Wikipedia page to practice some of the webscraping concepts covered in the asynchronous lecture.
= "https://en.wikipedia.org/wiki/Machine_learning" wiki_url
Download the website of the Wikipedia article and parse the web content.
import requests
from bs4 import BeautifulSoup
wiki = requests.get(wiki_url)
wiki_parsed = BeautifulSoup(wiki.content, "html.parser")
Scrape the title and subtitles from the Wikipedia article.
# Scrape the main title of the article
= wiki_parsed.find_all("h1")[0].get_text()
article_title
# Scrape the subtitles of the article
= [h.get_text() for h in wiki_parsed.find_all(class_="mw-headline")] article_subtitles
Scrape the text content from the Wikipedia article. Make sure the content is collapsed into a single character string.
content = [p.get_text() for p in wiki_parsed.find_all("p")]
text = "\n".join(content)
print(text)
The following materials were generated for students enrolled in PPOL564. Please do not distribute without permission.
ed769@georgetown.edu | www.ericdunford.com