Module 17 - Regular Expressions
Regular expressions (regex) are powerful pattern matching tools for searching, validating, and manipulating text. Python's re module provides comprehensive regex support.
1. Introduction to Regular Expressions
What is Regex?
A regular expression is a sequence of characters that defines a search pattern.
import re
# Simple pattern matching
text = "The phone number is 123-456-7890"
# Three digits, three digits, four digits, separated by dashes
pattern = r"\d{3}-\d{3}-\d{4}"
# re.search() returns a Match object, or None when nothing matches
match = re.search(pattern, text)
if match:
    print(match.group())  # 123-456-7890
Always use raw strings (r"pattern") for regex patterns to avoid escaping backslashes.
2. Basic Patterns
2.1 Literal Characters
import re
text = "Python is awesome"
# Match exact string (case-sensitive by default)
if re.search(r"Python", text):
    print("Found 'Python'")
# Case-insensitive matching: the re.IGNORECASE flag lets "python" match "Python"
if re.search(r"python", text, re.IGNORECASE):
    print("Found 'python' (case-insensitive)")
2.2 Character Classes
| Pattern | Matches | Example |
|---|---|---|
| `.` | Any character except newline | `a.c` matches "abc", "a9c" |
| `\d` | Any digit (0-9) | `\d\d` matches "42" |
| `\D` | Any non-digit | `\D+` matches "abc" |
| `\w` | Word character (a-z, A-Z, 0-9, _) | `\w+` matches "hello_123" |
| `\W` | Non-word character | `\W` matches "@", "!" |
| `\s` | Whitespace (space, tab, newline) | `\s+` matches " " |
| `\S` | Non-whitespace | `\S+` matches "hello" |
# Examples
text = "My email is john@example.com and phone is 555-1234"
# Runs of consecutive digits
digit_run = re.compile(r'\d+')
digits = digit_run.findall(text)
print(digits)  # ['555', '1234']
# Runs of word characters; '@', '.', '-' and spaces act as separators
word_run = re.compile(r'\w+')
words = word_run.findall(text)
print(words)  # ['My', 'email', 'is', 'john', 'example', 'com', ...]
2.3 Custom Character Sets
# [abc] - a set matches any single character listed inside it
pattern = r'[aeiou]'
vowels = re.compile(pattern).findall("hello world")
print(vowels)  # ['e', 'o', 'o']
# [a-z] - a dash inside the brackets denotes a range of characters
pattern = r'[a-z]+'
lowercase = re.compile(pattern).findall("Hello World 123")
print(lowercase)  # ['ello', 'orld']
# [^abc] - a leading caret negates the set: anything EXCEPT a, b, or c
pattern = r'[^aeiou]+'
consonants = re.compile(pattern).findall("hello world")
print(consonants)  # ['h', 'll', ' w', 'rld']
3. Quantifiers
Specify how many times a pattern should match.
| Quantifier | Meaning | Example |
|---|---|---|
| `*` | 0 or more | `a*` matches "", "a", "aaa" |
| `+` | 1 or more | `a+` matches "a", "aaa" (not "") |
| `?` | 0 or 1 (optional) | `colou?r` matches "color", "colour" |
| `{n}` | Exactly n times | `\d{3}` matches "123" |
| `{n,}` | n or more times | `\d{3,}` matches "123", "1234" |
| `{n,m}` | Between n and m times | `\d{2,4}` matches "12", "123", "1234" |
import re
# Match phone numbers
phone = "Call 123-456-7890 or 555-0000"
pattern = r'\d{3}-\d{3}-\d{4}'
phones = re.findall(pattern, phone)
print(phones)  # ['123-456-7890']  ("555-0000" lacks the middle group, so it is skipped)
# Match email
email = "Contact: john.doe@example.com"
# \w does not match '.', so the local part uses [\w.]+ to keep "john.doe" together
pattern = r'[\w.]+@\w+\.\w+'
emails = re.findall(pattern, email)
print(emails)  # ['john.doe@example.com'] (simplified)
# Optional characters
text = "color colour"
pattern = r'colou?r'  # the 'u' may appear zero or one time
matches = re.findall(pattern, text)
print(matches)  # ['color', 'colour']
4. Anchors and Boundaries
| Anchor | Matches | Example |
|---|---|---|
| `^` | Start of string | `^Hello` matches "Hello world" |
| `$` | End of string | `world$` matches "Hello world" |
| `\b` | Word boundary | `\bcat\b` matches "cat" not "catalog" |
| `\B` | Non-word boundary | `\Bcat\B` matches "concatenate" |
# Start of string
text = "Python is great"
if re.match(r'^Python', text):
    print("Starts with Python")
# End of string
if re.search(r'great$', text):
    print("Ends with great")
# Word boundaries: \b requires a word/non-word transition on both sides of "cat"
text = "The cat in the catalog"
pattern = r'\bcat\b'
matches = re.findall(pattern, text)
print(matches)  # ['cat'] (not 'cat' from 'catalog')
5. Groups and Capturing
5.1 Capturing Groups
Use parentheses () to capture parts of a match.
# Extract parts of a phone number
phone = "Contact: 123-456-7890"
# Each pair of parentheses creates a numbered group, counted from 1
pattern = r'(\d{3})-(\d{3})-(\d{4})'
match = re.search(pattern, phone)
if match:
    print(match.group(0))  # Full match: 123-456-7890
    print(match.group(1))  # First group: 123
    print(match.group(2))  # Second group: 456
    print(match.group(3))  # Third group: 7890
    print(match.groups())  # All groups: ('123', '456', '7890')
5.2 Named Groups
# Named capturing groups: (?P<name>...) lets you refer to a group by name
pattern = r'(?P<area>\d{3})-(?P<exchange>\d{3})-(?P<number>\d{4})'
match = re.search(pattern, "Call 555-123-4567")
if match:
    print(match.group('area'))  # 555
    print(match.group('exchange'))  # 123
    print(match.group('number'))  # 4567
    print(match.groupdict())  # {'area': '555', ...}
5.3 Non-Capturing Groups
# Non-capturing group (?:...) groups alternatives without creating group(1)
# The host part uses [\w.-]+ because \w alone would stop at the first '.'
pattern = r'(?:https?|ftp)://[\w.-]+'
url = "Visit https://example.com"
match = re.search(pattern, url)
if match:  # re.search() returns None when there is no match
    print(match.group(0))  # https://example.com
    # No group(1) - non-capturing
    print(match.groups())  # ()
6. The re Module Functions
6.1 re.search() - Find First Match
text = "The price is $50 and $100"
# '$' is a metacharacter (end anchor), so it is escaped to match a literal dollar sign
match = re.search(r'\$\d+', text)
if match:
    print(match.group())  # $50 (first match only)
6.2 re.match() - Match at Start
text = "Python is great"
match = re.match(r'Python', text)
if match:
    print("Matched at start")
# This fails (match only checks start)
match = re.match(r'great', text)
print(match)  # None
6.3 re.findall() - Find All Matches
text = "Prices: $10, $20, $30"
# Collect the text of every non-overlapping match, left to right
prices = [found.group() for found in re.finditer(r'\$\d+', text)]
print(prices)  # ['$10', '$20', '$30']
# With groups - each result is a tuple of the captured groups
text = "John:30, Jane:25, Bob:35"
data = [found.groups() for found in re.finditer(r'(\w+):(\d+)', text)]
print(data)  # [('John', '30'), ('Jane', '25'), ('Bob', '35')]
6.4 re.finditer() - Iterator of Matches
text = "Emails: john@example.com, jane@test.org"
pattern = r'\w+@\w+\.\w+'
# finditer() yields Match objects lazily, giving access to positions as well as text
for match in re.finditer(pattern, text):
    print(f"Found: {match.group()} at {match.span()}")
# Found: john@example.com at (8, 24)
# Found: jane@test.org at (26, 39)
6.5 re.sub() - Search and Replace
# Replace every match of a pattern
text = "My phone is 123-456-7890"
single_digit = re.compile(r'\d')
result = single_digit.sub('X', text)
print(result)  # My phone is XXX-XXX-XXXX
# Backreferences (\1, \2) in the replacement refer to the captured groups
text = "John Doe"
two_words = re.compile(r'(\w+) (\w+)')
result = two_words.sub(r'\2, \1', text)
print(result)  # Doe, John
# Function as replacement
def mask_digits(match: "re.Match[str]") -> str:
    """Return a run of '*' as long as the matched text.

    re.sub() calls this once per match and inserts the returned string
    in place of the matched text.
    """
    return '*' * len(match.group())


text = "Card: 1234-5678-9012-3456"
result = re.sub(r'\d+', mask_digits, text)
print(result)  # Card: ****-****-****-****
6.6 re.split() - Split by Pattern
# Split on multiple delimiters: any one of ',', ';' or ':' ends a field
text = "apple,banana;cherry:date"
delimiter = re.compile(r'[,;:]')
fruits = delimiter.split(text)
print(fruits)  # ['apple', 'banana', 'cherry', 'date']
# Split with capturing groups: the captured separators are kept in the result
text = "a1b2c3d"
digit_separator = re.compile(r'(\d)')
parts = digit_separator.split(text)
print(parts)  # ['a', '1', 'b', '2', 'c', '3', 'd']
7. Common Patterns
7.1 Email Validation
# Basic email pattern
pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
emails = [
    "john@example.com",  # Valid
    "jane.doe@test.co.uk",  # Valid
    "invalid.email",  # Invalid
    "@example.com",  # Invalid
]
for email in emails:
    # re.fullmatch() requires the WHOLE string to match. With re.match(), '$'
    # also matches just before a trailing newline, so "john@example.com\n"
    # would be accepted as valid.
    if re.fullmatch(pattern, email):
        print(f"✓ {email}")
    else:
        print(f"✗ {email}")
7.2 Phone Number Validation
# US phone number patterns
patterns = [
    r'^\d{3}-\d{3}-\d{4}$',  # 123-456-7890
    r'^\(\d{3}\) \d{3}-\d{4}$',  # (123) 456-7890
    r'^\d{10}$',  # 1234567890
]
phone = "(555) 123-4567"
for pattern in patterns:
    # re.fullmatch() rejects a trailing newline that re.match() + '$' would accept
    if re.fullmatch(pattern, phone):
        print(f"Valid: {phone}")
        break  # one accepted format is enough
7.3 URL Validation
# URL pattern: scheme, "://", then a host that must not start with
# whitespace or one of / $ . ? #, followed by any non-whitespace characters
pattern = r'^(https?|ftp)://[^\s/$.?#].[^\s]*$'
urls = [
    "https://example.com",
    "http://test.org/path?query=1",
    "ftp://files.example.com",
    "invalid-url",
]
for url in urls:
    # re.fullmatch() rejects a trailing newline that re.match() + '$' would accept
    if re.fullmatch(pattern, url):
        print(f"✓ {url}")
    else:
        print(f"✗ {url}")
7.4 Extract Data
# Extract dates (MM/DD/YYYY)
text = "Events on 12/25/2023 and 01/01/2024"
date_re = re.compile(r'\d{2}/\d{2}/\d{4}')
dates = date_re.findall(text)
print(dates)  # ['12/25/2023', '01/01/2024']
# Extract hashtags: '#' followed by one or more word characters
text = "Check out #Python #Regex #Tutorial"
hashtag_re = re.compile(r'#\w+')
hashtags = hashtag_re.findall(text)
print(hashtags)  # ['#Python', '#Regex', '#Tutorial']
# Extract prices: dollar sign, digits, then an optional '.' and further digits
text = "Items cost $19.99, $5.50, and $100"
price_re = re.compile(r'\$\d+\.?\d*')
prices = price_re.findall(text)
print(prices)  # ['$19.99', '$5.50', '$100']
8. Flags (Modifiers)
import re
text = "Python is AWESOME"
# Case-insensitive: letters match regardless of case
result = re.compile(r'python', re.IGNORECASE).findall(text)
print(result)  # ['Python']
# Multiline mode: '^' and '$' also match at the start and end of each line
text = """
Line 1
Line 2
Line 3
"""
line_re = re.compile(r'^Line \d$', re.MULTILINE)
lines = line_re.findall(text)
print(lines)  # ['Line 1', 'Line 2', 'Line 3']
# Dot matches newline: with DOTALL, '.' matches '\n' as well
text = "Hello\nWorld"
result = re.compile(r'Hello.World', re.DOTALL).search(text)
print(result.group())  # Hello\nWorld
# Verbose mode: whitespace in the pattern is ignored and '#' starts a comment
pattern = re.compile(r'''
\d{3} # Area code
- # Separator
\d{3} # Exchange
- # Separator
\d{4} # Number
''', flags=re.VERBOSE)
9. Practical Examples
9.1 Password Validation
def validate_password(password: str) -> bool:
    """Return True when *password* satisfies every rule below.

    Password must:
    - Be 8-20 characters
    - Contain uppercase and lowercase
    - Contain digit
    - Contain special character (one of !@#$%^&*(),.?":{}|<>)
    """
    if not 8 <= len(password) <= 20:
        return False
    # One pattern per rule; each must be found somewhere in the password.
    required_patterns = (
        r'[A-Z]',  # uppercase letter
        r'[a-z]',  # lowercase letter
        r'\d',  # digit
        r'[!@#$%^&*(),.?":{}|<>]',  # special character
    )
    return all(re.search(rule, password) for rule in required_patterns)


# The last sample is all lowercase so that it really fails the uppercase rule.
passwords = ["Weak", "Strong123!", "NoDigits!", "nouppercase1!"]
for pwd in passwords:
    print(f"{pwd}: {'✓' if validate_password(pwd) else '✗'}")
9.2 Log Parsing
# Parse log entries
log = """
2024-01-15 10:30:45 ERROR User login failed
2024-01-15 10:31:12 INFO User logged in successfully
2024-01-15 10:32:00 WARNING Low disk space
"""
# Groups: date, time, level, and the rest of the line as the message
pattern = r'(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) (\w+) (.+)'
# strip() drops the blank first/last lines left by the triple-quoted string
for line in log.strip().split('\n'):
    match = re.match(pattern, line)
    if match:
        date, time, level, message = match.groups()
        print(f"[{level}] {date} {time}: {message}")
9.3 Data Cleaning
# Clean and normalize text
def clean_text(text: str) -> str:
    """Strip HTML tags and punctuation from *text* and normalize its whitespace.

    Returns the cleaned string with runs of whitespace collapsed to a single
    space and no leading or trailing whitespace.
    """
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove special characters (anything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace LAST: removing a character that sat between two
    # spaces (e.g. "a - b") would otherwise leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


html = "<p>Hello <b>World</b>!!! Test.</p>"
cleaned = clean_text(html)
print(cleaned)  # "Hello World Test"
Summary
✅ Regex provides powerful pattern matching capabilities
✅ Use raw strings (r"") for regex patterns
✅ re.search(), re.findall(), re.sub() are most common
✅ Capturing groups extract specific parts of matches
✅ Common patterns: email, phone, URL validation
✅ Test regex patterns thoroughly with edge cases
Next Steps
In Module 18, you'll learn:
- Working with dates and times
- The datetime module
- Timezones and timedelta
- Parsing and formatting dates
Practice Exercises
- Write a regex to validate IPv4 addresses (e.g., 192.168.1.1)
- Extract all Twitter mentions (@username) from text
- Create a function to mask credit card numbers (show last 4 digits)
- Parse CSV data using regex (handle quoted fields)
- Build a simple text-based search and replace tool
Create a comprehensive email validator that:
- Validates proper email format
- Extracts username and domain separately
- Checks for common email providers (gmail, yahoo, etc.)
- Returns validation errors with specific messages