Module 27 - Web Scraping
Web scraping is the process of automatically extracting data from websites. In Python, the requests library fetches pages and BeautifulSoup parses the HTML, which together cover most scraping tasks.
1. Introduction to Web Scraping
When to Scrape
✅ Public data collection
✅ Price monitoring
✅ Research and analysis
✅ Content aggregation
Legal and Ethical Considerations
⚠️ Always check:
- Website's robots.txt file
- Terms of Service
- Copyright and data usage rights
- Rate limiting to avoid overloading servers
Ethics
Respect website owners, follow robots.txt, don't overload servers, and use data responsibly.
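Python's standard library includes urllib.robotparser, which can check a site's robots.txt before you scrape. A minimal sketch, assuming a placeholder site and user agent:

from urllib import robotparser

# Placeholder site and user agent - adjust for the site you plan to scrape
rp = robotparser.RobotFileParser()
rp.set_url('https://example.com/robots.txt')
rp.read()

if rp.can_fetch('MyScraperBot', 'https://example.com/products'):
    print("Allowed by robots.txt")
else:
    print("Disallowed by robots.txt - skip this path")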
2. Installation
pip install beautifulsoup4 requests lxml
3. Basic Web Scraping
3.1 Fetching HTML Content
import requests
from bs4 import BeautifulSoup
# Fetch webpage
url = 'https://example.com'
response = requests.get(url)
# Check if successful
if response.status_code == 200:
    html_content = response.text
    print(html_content[:500])  # First 500 characters
else:
    print(f"Error: {response.status_code}")
3.2 Parsing HTML with BeautifulSoup
from bs4 import BeautifulSoup
html = """
<html>
<head><title>Sample Page</title></head>
<body>
<h1>Welcome</h1>
<p class="intro">This is a paragraph.</p>
<div id="content">
<p>First paragraph in div.</p>
<p>Second paragraph in div.</p>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, 'html.parser')
# Get title
title = soup.title.string
print(f"Title: {title}")
# Find first <p> tag
first_p = soup.find('p')
print(f"First paragraph: {first_p.text}")
# Find all <p> tags
all_p = soup.find_all('p')
for p in all_p:
    print(p.text)
4. Navigating HTML Structure
4.1 Finding Elements
from bs4 import BeautifulSoup
import requests
url = 'https://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
# By tag name
h1 = soup.find('h1')
all_divs = soup.find_all('div')
# By class
intro = soup.find(class_='intro')
all_items = soup.find_all(class_='item')
# By ID
content = soup.find(id='content')
# By attribute
links = soup.find_all('a', href=True)
# CSS selectors
items = soup.select('.item') # Class
content = soup.select('#content') # ID
links = soup.select('div > a') # Direct child
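When only the first match is needed, BeautifulSoup's select_one() returns a single element (or None) instead of a list, a convenient counterpart to the select() calls above:

first_item = soup.select_one('.item')  # First element matching the selector, or None
if first_item is not None:
    print(first_item.get_text(strip=True))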
4.2 Extracting Data
# Get text content
text = soup.find('p').text
text_stripped = soup.find('p').get_text(strip=True)
# Get attributes
link = soup.find('a')
href = link['href']
href_alt = link.get('href')
# Get all links
links = soup.find_all('a')
for link in links:
    print(f"Text: {link.text}, URL: {link.get('href')}")
5. Practical Examples
5.1 Scraping News Headlines
import requests
from bs4 import BeautifulSoup
def scrape_headlines(url):
    """Scrape news headlines from a webpage"""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all headline elements (adjust selectors based on site structure)
    headlines = soup.find_all('h2', class_='headline')

    results = []
    for headline in headlines:
        title = headline.text.strip()
        link = headline.find('a')
        link_url = link['href'] if link else None
        results.append({'title': title, 'url': link_url})

    return results
# Usage
# headlines = scrape_headlines('https://news-site.com')
# for item in headlines:
#     print(f"{item['title']}: {item['url']}")
5.2 Scraping Product Information
import requests
from bs4 import BeautifulSoup
def scrape_products(url):
    """Scrape product information"""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    products = []
    # Find all product containers
    items = soup.find_all('div', class_='product')
    for item in items:
        product = {
            'name': item.find('h3', class_='product-name').text.strip(),
            'price': item.find('span', class_='price').text.strip(),
            'rating': item.find('span', class_='rating').text.strip(),
            'url': item.find('a')['href']
        }
        products.append(product)

    return products
5.3 Scraping Table Data
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_table(url):
    """Scrape HTML table into pandas DataFrame"""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find table
    table = soup.find('table')

    # Extract headers
    headers = []
    for th in table.find_all('th'):
        headers.append(th.text.strip())

    # Extract rows
    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip header row
        cells = [td.text.strip() for td in tr.find_all('td')]
        rows.append(cells)

    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)
    return df
# Usage
# df = scrape_table('https://example.com/table')
# print(df.head())
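For simple, well-formed tables, pandas can often replace the manual parsing above: read_html() returns a list of DataFrames, one per <table> on the page (it requires lxml or html5lib). A quick sketch with a placeholder URL:

import pandas as pd

# tables = pd.read_html('https://example.com/table')  # One DataFrame per <table>
# df = tables[0]
# print(df.head())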
6. Handling Dynamic Content
Using Selenium for JavaScript-heavy Sites
pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Setup Chrome driver
driver = webdriver.Chrome()

try:
    # Navigate to page
    driver.get('https://example.com')

    # Wait for element to load
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'content'))
    )

    # Extract data
    title = driver.find_element(By.TAG_NAME, 'h1').text
    print(f"Title: {title}")

    # Get page source for BeautifulSoup
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
finally:
    driver.quit()
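To run this on a server or in the background, Chrome can be started headless (no visible window) through ChromeOptions; depending on your Chrome and Selenium versions, the plain '--headless' flag may be needed instead of '--headless=new':

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless=new')  # Run without opening a browser window
driver = webdriver.Chrome(options=options)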
7. Advanced Techniques
7.1 Handling Pagination
import requests
from bs4 import BeautifulSoup
import time
def scrape_all_pages(base_url, max_pages=10):
    """Scrape data across multiple pages"""
    all_data = []

    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        print(f"Scraping page {page}...")

        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract data from current page
        items = soup.find_all('div', class_='item')
        all_data.extend(items)

        # Be polite - wait between requests
        time.sleep(1)

        # Check if there's a next page
        next_button = soup.find('a', class_='next')
        if not next_button:
            break

    return all_data
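Usage mirrors the earlier examples; the URL and the 'item'/'next' selectors are placeholders to adapt to the real site:

# Usage
# items = scrape_all_pages('https://example.com/listings', max_pages=5)
# print(f"Collected {len(items)} items")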
7.2 User-Agent and Headers
import requests
# Custom headers to mimic a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml',
    'Accept-Language': 'en-US,en;q=0.9',
}
response = requests.get('https://example.com', headers=headers)
7.3 Session Management (for Login)
import requests
from bs4 import BeautifulSoup
# Create session
session = requests.Session()
# Login
login_url = 'https://example.com/login'
login_data = {
    'username': 'your_username',
    'password': 'your_password'
}
session.post(login_url, data=login_data)
# Access protected page
response = session.get('https://example.com/protected')
soup = BeautifulSoup(response.content, 'html.parser')
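Many login forms also embed a hidden CSRF token that must be read from the login page and submitted along with the credentials. A sketch of that pattern - the field name 'csrf_token' is an assumption and varies by site:

# Fetch the login page first and extract the hidden token (field name is site-specific)
login_page = session.get(login_url)
login_soup = BeautifulSoup(login_page.content, 'html.parser')
token_field = login_soup.find('input', {'name': 'csrf_token'})
if token_field:
    login_data['csrf_token'] = token_field['value']
session.post(login_url, data=login_data)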
8. Error Handling and Robustness
import requests
from bs4 import BeautifulSoup
import time
from requests.exceptions import RequestException
def safe_scrape(url, max_retries=3):
    """Scrape with error handling and retries"""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            return soup
        except RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                print(f"Failed to scrape {url} after {max_retries} attempts")
    return None
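Usage (placeholder URL):

# soup = safe_scrape('https://example.com')
# if soup is not None:
#     print(soup.title.string)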
9. Saving Scraped Data
import json
import csv
import pandas as pd
# Save to JSON
data = [{'name': 'Item 1', 'price': 19.99}, {'name': 'Item 2', 'price': 29.99}]
with open('data.json', 'w') as f:
    json.dump(data, f, indent=2)

# Save to CSV
with open('data.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['name', 'price'])
    writer.writeheader()
    writer.writerows(data)
# Save to DataFrame then CSV
df = pd.DataFrame(data)
df.to_csv('data.csv', index=False)
10. Complete Example: Price Monitor
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
class PriceMonitor:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def get_price(self, url):
        """Extract price from product page"""
        try:
            response = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Adjust selector based on actual website structure
            price_element = soup.find('span', class_='price')
            if price_element:
                price_text = price_element.text.strip()
                # Extract numeric value
                price = float(price_text.replace('$', '').replace(',', ''))
                return price
        except Exception as e:
            print(f"Error: {e}")
        return None

    def monitor_products(self, products):
        """Check the current price of each monitored product"""
        results = []
        for name, url in products.items():
            print(f"Checking {name}...")
            price = self.get_price(url)
            results.append({
                'timestamp': datetime.now(),
                'product': name,
                'price': price,
                'url': url
            })
            time.sleep(2)  # Be polite

        # Save results (appended to the history file without a header row)
        df = pd.DataFrame(results)
        df.to_csv('price_history.csv', mode='a', header=False, index=False)
        return results
# Usage
# monitor = PriceMonitor()
# products = {
#     'Laptop': 'https://example.com/laptop',
#     'Phone': 'https://example.com/phone'
# }
# monitor.monitor_products(products)
Summary
✅ BeautifulSoup parses HTML and XML documents
✅ Use requests to fetch webpage content
✅ Extract data using tags, classes, IDs, and CSS selectors
✅ Selenium handles JavaScript-heavy sites
✅ Always respect robots.txt and rate limits
✅ Implement error handling and retries
Next Steps
In Module 28, you'll learn:
- Flask web framework basics
- Creating routes and templates
- Building simple web applications
- Handling forms and user input
Practice Exercises
- Scrape all article titles from a news website's homepage
- Extract product information (name, price, rating) from an e-commerce site
- Build a job listing aggregator from multiple job boards
- Create a weather data scraper that saves daily forecasts
- Develop a price comparison tool for multiple online stores
Challenge
Create a comprehensive web scraping framework that:
- Supports multiple websites with different structures
- Implements polite crawling (respects robots.txt, rate limits)
- Handles pagination and infinite scroll
- Detects and adapts to website structure changes
- Stores data in a database with deduplication
- Sends alerts for significant changes (e.g., price drops)
- Includes logging and error recovery