Module 27 - Web Scraping

Web scraping is the process of automatically extracting data from websites. Python's requests and BeautifulSoup libraries make it straightforward to fetch pages and parse their HTML.


1. Introduction to Web Scraping

When to Scrape

✅ Public data collection
✅ Price monitoring
✅ Research and analysis
✅ Content aggregation

⚠️ Always check:

  • Website's robots.txt file
  • Terms of Service
  • Copyright and data usage rights
  • Rate limiting to avoid overloading servers

Ethics

Respect website owners, follow robots.txt, don't overload servers, and use data responsibly.
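
You can check a site's robots.txt programmatically with Python's standard-library urllib.robotparser. The sketch below is a minimal example; the site URL and the 'MyScraperBot' user-agent string are placeholders, so substitute whatever you actually use.

from urllib.robotparser import RobotFileParser

# Point the parser at the site's robots.txt (placeholder URL)
rp = RobotFileParser('https://example.com/robots.txt')
rp.read() # Download and parse the rules

# can_fetch() reports whether a given user agent may request a path
if rp.can_fetch('MyScraperBot', 'https://example.com/some/page'):
    print("robots.txt allows scraping this page")
else:
    print("robots.txt disallows this page - skip it")

rp.crawl_delay('MyScraperBot') returns the delay the site requests between requests, if one is declared, which you can honor with time.sleep().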


2. Installation

pip install beautifulsoup4 requests lxml
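
The lxml package is an optional parser backend for BeautifulSoup. The examples in this module use Python's built-in 'html.parser', but once lxml is installed you can pass 'lxml' instead for faster parsing, for example:

from bs4 import BeautifulSoup

# 'html.parser' ships with Python; 'lxml' is a faster third-party backend
soup = BeautifulSoup('<p>Hello</p>', 'lxml')
print(soup.p.text) # Hello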

3. Basic Web Scraping

3.1 Fetching HTML Content

import requests
from bs4 import BeautifulSoup

# Fetch webpage
url = 'https://example.com'
response = requests.get(url)

# Check if successful
if response.status_code == 200:
    html_content = response.text
    print(html_content[:500]) # First 500 characters
else:
    print(f"Error: {response.status_code}")

3.2 Parsing HTML with BeautifulSoup

from bs4 import BeautifulSoup

html = """
<html>
<head><title>Sample Page</title></head>
<body>
<h1>Welcome</h1>
<p class="intro">This is a paragraph.</p>
<div id="content">
<p>First paragraph in div.</p>
<p>Second paragraph in div.</p>
</div>
</body>
</html>
"""

soup = BeautifulSoup(html, 'html.parser')

# Get title
title = soup.title.string
print(f"Title: {title}")

# Find first <p> tag
first_p = soup.find('p')
print(f"First paragraph: {first_p.text}")

# Find all <p> tags
all_p = soup.find_all('p')
for p in all_p:
    print(p.text)

4. Navigating HTML Structure

4.1 Finding Elements

from bs4 import BeautifulSoup
import requests

url = 'https://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# By tag name
h1 = soup.find('h1')
all_divs = soup.find_all('div')

# By class
intro = soup.find(class_='intro')
all_items = soup.find_all(class_='item')

# By ID
content = soup.find(id='content')

# By attribute
links = soup.find_all('a', href=True)

# CSS selectors
items = soup.select('.item') # Class
content = soup.select('#content') # ID
links = soup.select('div > a') # Direct child

4.2 Extracting Data

# Get text content
text = soup.find('p').text
text_stripped = soup.find('p').get_text(strip=True)

# Get attributes
link = soup.find('a')
href = link['href']
href_alt = link.get('href')

# Get all links
links = soup.find_all('a')
for link in links:
    print(f"Text: {link.text}, URL: {link.get('href')}")

5. Practical Examples

5.1 Scraping News Headlines

import requests
from bs4 import BeautifulSoup

def scrape_headlines(url):
    """Scrape news headlines from a webpage"""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all headline elements (adjust selectors based on site structure)
    headlines = soup.find_all('h2', class_='headline')

    results = []
    for headline in headlines:
        title = headline.text.strip()
        link = headline.find('a')
        link_url = link['href'] if link else None
        results.append({'title': title, 'url': link_url})

    return results

# Usage
# headlines = scrape_headlines('https://news-site.com')
# for item in headlines:
#     print(f"{item['title']}: {item['url']}")

5.2 Scraping Product Information

import requests
from bs4 import BeautifulSoup

def scrape_products(url):
    """Scrape product information"""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    products = []

    # Find all product containers
    items = soup.find_all('div', class_='product')

    for item in items:
        product = {
            'name': item.find('h3', class_='product-name').text.strip(),
            'price': item.find('span', class_='price').text.strip(),
            'rating': item.find('span', class_='rating').text.strip(),
            'url': item.find('a')['href']
        }
        products.append(product)

    return products

5.3 Scraping Table Data

import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_table(url):
    """Scrape HTML table into pandas DataFrame"""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find table
    table = soup.find('table')

    # Extract headers
    headers = []
    for th in table.find_all('th'):
        headers.append(th.text.strip())

    # Extract rows
    rows = []
    for tr in table.find_all('tr')[1:]: # Skip header row
        cells = [td.text.strip() for td in tr.find_all('td')]
        rows.append(cells)

    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)
    return df

# Usage
# df = scrape_table('https://example.com/table')
# print(df.head())

6. Handling Dynamic Content

Using Selenium for JavaScript-heavy Sites

pip install selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Setup Chrome driver
driver = webdriver.Chrome()

try:
    # Navigate to page
    driver.get('https://example.com')

    # Wait for element to load
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'content'))
    )

    # Extract data
    title = driver.find_element(By.TAG_NAME, 'h1').text
    print(f"Title: {title}")

    # Get page source for BeautifulSoup
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

finally:
    driver.quit()

7. Advanced Techniques

7.1 Handling Pagination

import requests
from bs4 import BeautifulSoup
import time

def scrape_all_pages(base_url, max_pages=10):
    """Scrape data across multiple pages"""
    all_data = []

    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        print(f"Scraping page {page}...")

        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract data from current page
        items = soup.find_all('div', class_='item')
        all_data.extend(items)

        # Be polite - wait between requests
        time.sleep(1)

        # Check if there's a next page
        next_button = soup.find('a', class_='next')
        if not next_button:
            break

    return all_data

7.2 User-Agent and Headers

import requests

# Custom headers to mimic a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml',
    'Accept-Language': 'en-US,en;q=0.9',
}

response = requests.get('https://example.com', headers=headers)

7.3 Session Management (for Login)

import requests
from bs4 import BeautifulSoup

# Create session
session = requests.Session()

# Login
login_url = 'https://example.com/login'
login_data = {
    'username': 'your_username',
    'password': 'your_password'
}

session.post(login_url, data=login_data)

# Access protected page
response = session.get('https://example.com/protected')
soup = BeautifulSoup(response.content, 'html.parser')

8. Error Handling and Robustness

import requests
from bs4 import BeautifulSoup
import time
from requests.exceptions import RequestException

def safe_scrape(url, max_retries=3):
    """Scrape with error handling and retries"""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            return soup

        except RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt) # Exponential backoff
            else:
                print(f"Failed to scrape {url} after {max_retries} attempts")
                return None

9. Saving Scraped Data

import json
import csv
import pandas as pd

# Save to JSON
data = [{'name': 'Item 1', 'price': 19.99}, {'name': 'Item 2', 'price': 29.99}]

with open('data.json', 'w') as f:
    json.dump(data, f, indent=2)

# Save to CSV
with open('data.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['name', 'price'])
    writer.writeheader()
    writer.writerows(data)

# Save to DataFrame then CSV
df = pd.DataFrame(data)
df.to_csv('data.csv', index=False)

10. Complete Example: Price Monitor

import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

class PriceMonitor:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def get_price(self, url):
        """Extract price from product page"""
        try:
            response = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Adjust selector based on actual website structure
            price_element = soup.find('span', class_='price')
            if price_element:
                price_text = price_element.text.strip()
                # Extract numeric value
                price = float(price_text.replace('$', '').replace(',', ''))
                return price
        except Exception as e:
            print(f"Error: {e}")
        return None

    def monitor_products(self, products):
        """Check the current price of each product once"""
        results = []

        for name, url in products.items():
            print(f"Checking {name}...")
            price = self.get_price(url)

            results.append({
                'timestamp': datetime.now(),
                'product': name,
                'price': price,
                'url': url
            })

            time.sleep(2) # Be polite

        # Append results to CSV, writing the header only when the file is new
        df = pd.DataFrame(results)
        file_exists = os.path.isfile('price_history.csv')
        df.to_csv('price_history.csv', mode='a', header=not file_exists, index=False)

        return results

# Usage
# monitor = PriceMonitor()
# products = {
#     'Laptop': 'https://example.com/laptop',
#     'Phone': 'https://example.com/phone'
# }
# monitor.monitor_products(products)

Summary

✅ BeautifulSoup parses HTML and XML documents
✅ Use requests to fetch webpage content
✅ Extract data using tags, classes, IDs, and CSS selectors
✅ Selenium handles JavaScript-heavy sites
✅ Always respect robots.txt and rate limits
✅ Implement error handling and retries


Next Steps

In Module 28, you'll learn:

  • Flask web framework basics
  • Creating routes and templates
  • Building simple web applications
  • Handling forms and user input

Practice Exercises

  1. Scrape all article titles from a news website's homepage
  2. Extract product information (name, price, rating) from an e-commerce site
  3. Build a job listing aggregator from multiple job boards
  4. Create a weather data scraper that saves daily forecasts
  5. Develop a price comparison tool for multiple online stores

Challenge

Create a comprehensive web scraping framework that:

  • Supports multiple websites with different structures
  • Implements polite crawling (respects robots.txt, rate limits)
  • Handles pagination and infinite scroll
  • Detects and adapts to website structure changes
  • Stores data in a database with deduplication
  • Sends alerts for significant changes (e.g., price drops)
  • Includes logging and error recovery