Python 3.12 Released
The latest version of Python includes new features...
Web scraping is the process of extracting data from websites. BeautifulSoup is a Python library that makes it easy to parse HTML and XML documents. Combined with the requests library, you can download web pages and extract specific information such as text, links, images, or tables. Important: always respect websites' robots.txt files and terms of service.
# Web Scraping with BeautifulSoup
import requests
from bs4 import BeautifulSoup
import csv
import time
print("WEB SCRAPING BASICS WITH BEAUTIFULSOUP")
print("=" * 60)
# Note: For actual web scraping, you may need to install:
# pip install requests beautifulsoup4

# Example 1: Basic web page fetching
print("\n1. BASIC WEB PAGE FETCHING")
print("-" * 30)

# Sample HTML for demonstration.  The markup was missing from the
# original snippet, which made soup.title/soup.h1 return None and
# crash below; restored as a small self-contained bookstore page.
sample_html = '''
<html>
<head><title>Sample Book Store</title></head>
<body>
<h1>Welcome to Our Book Store</h1>
<div class="book">
<h2 class="title">Python Crash Course</h2>
<p class="author">Eric Matthes</p>
<span class="price">$39.99</span>
<a href="/books/python-crash-course">Details</a>
</div>
<div class="book">
<h2 class="title">Automate the Boring Stuff</h2>
<p class="author">Al Sweigart</p>
<span class="price">$29.99</span>
<a href="/books/automate">Details</a>
</div>
</body>
</html>
'''

# Parse the HTML
soup = BeautifulSoup(sample_html, 'html.parser')
print("Page Title:", soup.title.text)
print("First h1 tag:", soup.h1.text)

# Find all book titles
print("\nBook Titles:")
book_titles = soup.find_all('h2', class_='title')
for i, title in enumerate(book_titles, 1):
    print(f"{i}. {title.text}")

# Example 2: Extracting specific data
print("\n\n2. EXTRACTING SPECIFIC DATA")
print("-" * 30)
# Find all books
def extract_book_data(html_content):
    """Extract book information from HTML.

    Args:
        html_content: HTML string containing zero or more
            ``<div class="book">`` elements.

    Returns:
        A list of dicts with 'title', 'author', 'price' and 'link'
        keys; any missing field falls back to the string 'N/A'.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    def _text_or_na(parent, tag, cls):
        # Single lookup per field — the original called find() twice
        # for every field (once to test, once to read).
        node = parent.find(tag, class_=cls)
        return node.text if node else 'N/A'

    books = []
    for book in soup.find_all('div', class_='book'):
        link_tag = book.find('a')
        books.append({
            'title': _text_or_na(book, 'h2', 'title'),
            'author': _text_or_na(book, 'p', 'author'),
            'price': _text_or_na(book, 'span', 'price'),
            'link': link_tag['href'] if link_tag else 'N/A',
        })
    return books
# Run the extractor over the sample page and pretty-print each record.
books_data = extract_book_data(sample_html)
print("Extracted Book Data:")
print("=" * 50)
for index, record in enumerate(books_data, 1):
    print(f"\nBook {index}:")
    print(f" Title: {record['title']}")
    print(f" Author: {record['author']}")
    print(f" Price: {record['price']}")
    print(f" Link: {record['link']}")
# Example 3: Working with real website (with caution)
print("\n\n3. WORKING WITH REAL WEBSITES")
print("-" * 30)
print("Note: Always check robots.txt and terms of service!")
print("Let's use a public demo site instead of a real one.")

# Demo markup for practice.  The table tags were missing from the
# original snippet, so find('table', id='products') always returned
# None; restored so the parsing below has data to work on.
demo_html = '''
<table id="products">
<tr><th>Product</th><th>Price</th><th>Stock</th></tr>
<tr><td>Laptop</td><td>$999</td><td>In Stock</td></tr>
<tr><td>Mouse</td><td>$25</td><td>Out of Stock</td></tr>
<tr><td>Keyboard</td><td>$79</td><td>In Stock</td></tr>
</table>
'''

# Parse table data
table_soup = BeautifulSoup(demo_html, 'html.parser')
table = table_soup.find('table', id='products')
print("\nProduct Table:")
print("-" * 40)
if table:
    # The original enumerated the rows but never used the index.
    for row in table.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all(['th', 'td'])]
        print(f"{cells[0]:15} {cells[1]:10} {cells[2]}")
else:
    print("Table not found")
# Example 4: Saving scraped data to CSV
print("\n\n4. SAVING DATA TO CSV")
print("-" * 30)

# Extract product data from the table parsed in example 3.
products = []
if table:
    for row in table.find_all('tr')[1:]:  # Skip header row
        cols = row.find_all('td')
        if len(cols) >= 3:
            products.append({
                'name': cols[0].text.strip(),
                'price': cols[1].text.strip(),
                'stock': cols[2].text.strip(),
            })

# Save to CSV
filename = 'products.csv'
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['name', 'price', 'stock'])
    writer.writeheader()
    writer.writerows(products)

# Bug fix: the message previously printed the literal text "(unknown)"
# instead of the file name.
print(f"Saved {len(products)} products to {filename}")

print("\nCSV Content:")
# Read back with an explicit encoding to match how it was written.
with open(filename, 'r', encoding='utf-8') as f:
    print(f.read())
# Example 5: Advanced scraping techniques
print("\n\n5. ADVANCED SCRAPING TECHNIQUES")
print("-" * 30)

# Complex HTML with nested structure (restored: the article markup was
# stripped from the original snippet, so no articles were ever found).
complex_html = '''
<article class="news-item featured">
<h2><a href="/news/python-312">Python 3.12 Released</a></h2>
<span class="date">2023-10-02</span>
<span class="category">Programming</span>
<p class="summary">The latest version of Python includes new features...</p>
<ul><li>python</li><li>release</li></ul>
</article>
<article class="news-item">
<h2><a href="/news/ai-breakthrough">AI Breakthrough</a></h2>
<span class="date">2023-10-01</span>
<span class="category">AI</span>
<p class="summary">Researchers announce new AI model...</p>
<ul><li>ai</li><li>research</li></ul>
</article>
'''

# Parse complex structure
news_soup = BeautifulSoup(complex_html, 'html.parser')
print("News Articles:")
print("=" * 40)


def _first_text(parent, tag, cls):
    """Return stripped text of the first matching child, or 'N/A'."""
    node = parent.find(tag, class_=cls)
    return node.text.strip() if node else 'N/A'


articles = news_soup.find_all('article', class_='news-item')
for i, article in enumerate(articles, 1):
    # Guard the h2/a lookups: the original raised AttributeError /
    # TypeError on any article missing a heading or link.
    heading = article.find('h2')
    title = heading.text.strip() if heading else 'N/A'
    link_tag = article.find('a')
    link = link_tag['href'] if link_tag else 'N/A'
    date = _first_text(article, 'span', 'date')
    category = _first_text(article, 'span', 'category')
    summary = _first_text(article, 'p', 'summary')
    # Extract tags
    tags = [tag.text for tag in article.find_all('li')]

    print(f"\nArticle {i}:")
    print(f" Title: {title}")
    print(f" Link: {link}")
    print(f" Date: {date}")
    print(f" Category: {category}")
    print(f" Summary: {summary[:50]}...")
    print(f" Tags: {', '.join(tags)}")
    # Check if featured
    if 'featured' in article.get('class', []):
        print(" ★ Featured Article")
# Example 6: Error handling in web scraping
# Single print with a newline separator emits the same two lines.
print("\n\n6. ERROR HANDLING IN WEB SCRAPING", "-" * 30, sep="\n")
def safe_scrape(url, scenario=None):
    """Safely scrape a webpage with error handling (simulated).

    Args:
        url: Address to (pretend to) fetch.
        scenario: Optional outcome to force — one of 'success',
            '404_error', 'timeout' or 'parse_error'.  When None
            (the default, matching the original behavior) a
            scenario is chosen at random.  Injectable for testing.

    Returns:
        A dict with a 'status' key ('success' or 'error'), plus
        'data' on success or 'message' on error.
    """
    import random  # demo-only dependency, kept local as in the original
    try:
        # In real scraping, you would use:
        # response = requests.get(url, headers={'User-Agent': 'Your Bot'})
        # response.raise_for_status()  # Check for HTTP errors
        # For demo, simulate different scenarios
        if scenario is None:
            scenario = random.choice(
                ["success", "404_error", "timeout", "parse_error"]
            )
        if scenario == "success":
            print(f"Successfully fetched {url}")
            # Parse would happen here
            return {"status": "success", "data": "Sample data"}
        elif scenario == "404_error":
            print(f"Error: Page not found (404) for {url}")
            return {"status": "error", "message": "Page not found"}
        elif scenario == "timeout":
            print(f"Error: Request timed out for {url}")
            return {"status": "error", "message": "Request timeout"}
        elif scenario == "parse_error":
            print(f"Error: Could not parse HTML from {url}")
            return {"status": "error", "message": "Parsing failed"}
        # Only reachable when a caller forces an unknown scenario.
        return {"status": "error", "message": f"Unknown scenario: {scenario}"}
    except Exception as e:
        print(f"Unexpected error: {e}")
        return {"status": "error", "message": str(e)}
# Exercise the simulated scraper three times and report each outcome.
print("Testing error handling scenarios:")
for page in range(3):
    outcome = safe_scrape(f"https://example.com/page{page}")
    print(f"Result: {outcome['status']}")
    if outcome['status'] == "error":
        print(f" Reason: {outcome['message']}")
    print()

# Example 7: Web scraping etiquette
print("\n\n7. WEB SCRAPING ETIQUETTE", "-" * 30, sep="\n")
print("Important rules for ethical web scraping:")
print("1. Check robots.txt (e.g., https://example.com/robots.txt)")
print("2. Respect rate limits (add delays between requests)")
print("3. Identify your bot with User-Agent header")
print("4. Don't overload servers")
print("5. Check website's terms of service")
print("6. Only scrape publicly available data")
print("7. Consider using official APIs if available")
# Example of polite scraping with delay
def polite_scraper(urls):
    """Scrape multiple URLs, pausing between requests to be polite.

    Args:
        urls: Iterable of URL strings to (pretend to) fetch.

    Returns:
        List of {'url': ..., 'data': ...} dicts, one per URL, in order.
    """
    results = []
    for position, target in enumerate(urls):
        print(f"Scraping {target}...")
        # Simulate request
        # response = requests.get(target, headers={
        #     'User-Agent': 'MyScraperBot/1.0 (educational-purpose)'
        # })
        # Add delay to be polite (2-5 seconds between requests)
        if position > 0:
            delay = 3  # seconds
            print(f"Waiting {delay} seconds to be polite...")
            # time.sleep(delay)
        # Process response...
        results.append({"url": target, "data": f"Data from {target}"})
    return results
# Example 8: Complete web scraping project
# One print with a newline separator yields the same two output lines.
print("\n\n8. COMPLETE WEB SCRAPING PROJECT", "-" * 30, sep="\n")
class BookScraper:
    """A simple book scraper for demonstration.

    Collects book records (title/author/price/rating dicts) from
    sample HTML, displays them, saves them to CSV, and filters by price.
    """

    def __init__(self):
        # Accumulates one dict per scraped book.
        self.books = []

    @staticmethod
    def _field_text(parent, tag, cls=None):
        """Return the text of the first matching child, or 'N/A'."""
        # Single lookup per field — the original called find() twice
        # for each one (once to test, once to read).
        node = parent.find(tag, class_=cls) if cls else parent.find(tag)
        return node.text if node else 'N/A'

    def scrape_sample_data(self):
        """Scrape from sample HTML (in real life, this would fetch from URL).

        Returns:
            The total number of books collected so far.
        """
        # Sample data representing a bookstore.  The markup was stripped
        # from the original snippet, so no <div class="book"> was ever
        # matched; restored here so the demo produces records.
        html_content = '''
<div class="book">
<h3>Python Cookbook</h3>
<p class="author">David Beazley</p>
<p class="price">$49.99</p>
<p class="rating">4.6/5</p>
</div>
<div class="book">
<h3>Fluent Python</h3>
<p class="author">Luciano Ramalho</p>
<p class="price">$44.99</p>
<p class="rating">4.7/5</p>
</div>
'''
        soup = BeautifulSoup(html_content, 'html.parser')
        for book_div in soup.find_all('div', class_='book'):
            self.books.append({
                'title': self._field_text(book_div, 'h3'),
                'author': self._field_text(book_div, 'p', 'author'),
                'price': self._field_text(book_div, 'p', 'price'),
                'rating': self._field_text(book_div, 'p', 'rating'),
            })
        return len(self.books)

    def display_books(self):
        """Display all scraped books on stdout."""
        print(f"\nFound {len(self.books)} books:")
        print("=" * 50)
        for i, book in enumerate(self.books, 1):
            print(f"\nBook {i}:")
            print(f" Title: {book['title']}")
            print(f" Author: {book['author']}")
            print(f" Price: {book['price']}")
            print(f" Rating: {book['rating']}")

    def save_to_csv(self, filename):
        """Save books to a CSV file at *filename* (UTF-8)."""
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['title', 'author', 'price', 'rating']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.books)
        # Bug fix: the message previously printed the literal text
        # "(unknown)" instead of the file name.
        print(f"\nSaved {len(self.books)} books to {filename}")

    def filter_by_price(self, max_price):
        """Return books whose price is at most *max_price*.

        Prices like "$49.99" are parsed to floats; books whose price
        cannot be parsed (e.g. 'N/A') are skipped silently.
        """
        affordable_books = []
        for book in self.books:
            # Convert "$49.99" to 49.99
            try:
                price = float(book['price'].replace('$', '').strip())
            except (ValueError, AttributeError):
                continue
            if price <= max_price:
                affordable_books.append(book)
        return affordable_books
# Run the scraper end to end: scrape, display, save, then filter.
print("Running Book Scraper...")
scraper = BookScraper()
scraped_total = scraper.scrape_sample_data()
print(f"Scraped {scraped_total} books")
scraper.display_books()

# Save to CSV
scraper.save_to_csv('books.csv')

# Filter books
print("\nBooks under $45:")
for cheap_book in scraper.filter_by_price(45):
    print(f"- {cheap_book['title']}: {cheap_book['price']}")