Web Scraping Basics with BeautifulSoup

Web scraping is the process of extracting data from websites. BeautifulSoup is a Python library that makes it easy to parse HTML and XML documents. Combined with the requests library, it lets you download web pages and extract specific information such as text, links, images, or tables. Important: always respect websites' robots.txt files and terms of service.

# Web Scraping with BeautifulSoup
import requests
from bs4 import BeautifulSoup
import csv
import time

# Console banner for the tutorial output.
banner = "WEB SCRAPING BASICS WITH BEAUTIFULSOUP"
print(banner)
print("=" * 60)

# NOTE: real scraping needs the third-party packages:
#   pip install requests beautifulsoup4

# Example 1: Basic web page fetching
print("\n1. BASIC WEB PAGE FETCHING")
print("-" * 30)

# Let's use a sample HTML for demonstration
# ---------------------------------------------------------------------------
# Example 1: Basic web page fetching
# ---------------------------------------------------------------------------
# A self-contained HTML snippet stands in for a downloaded page so the demo
# runs without network access.  (The href values are placeholders.)
sample_html = '''
<html>
<head>
    <title>Sample Book Store</title>
</head>
<body>
    <h1>Welcome to Our Book Store</h1>
    <div class="book">
        <h2 class="title">Python Programming</h2>
        <p class="author">John Doe</p>
        <span class="price">$29.99</span> <a href="/books/python-programming">Details</a>
    </div>
    <div class="book">
        <h2 class="title">Data Science Basics</h2>
        <p class="author">Jane Smith</p>
        <span class="price">$34.99</span> <a href="/books/data-science-basics">Details</a>
    </div>
    <div class="book">
        <h2 class="title">Web Development</h2>
        <p class="author">Bob Johnson</p>
        <span class="price">$24.99</span> <a href="/books/web-development">Details</a>
    </div>
</body>
</html>
'''

# Parse the HTML
soup = BeautifulSoup(sample_html, 'html.parser')
print("Page Title:", soup.title.text)
print("First h1 tag:", soup.h1.text)

# Find all book titles
print("\nBook Titles:")
book_titles = soup.find_all('h2', class_='title')
for i, title in enumerate(book_titles, 1):
    print(f"{i}. {title.text}")

# ---------------------------------------------------------------------------
# Example 2: Extracting specific data
# ---------------------------------------------------------------------------
print("\n\n2. EXTRACTING SPECIFIC DATA")
print("-" * 30)


def extract_book_data(html_content):
    """Extract book information from HTML.

    Args:
        html_content: HTML string containing <div class="book"> entries.

    Returns:
        A list of dicts with keys 'title', 'author', 'price', and 'link';
        any field missing from a book falls back to 'N/A'.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    books = []
    for book in soup.find_all('div', class_='book'):
        # Look each tag up once, not twice (the original repeated every
        # find() in both the condition and the access).
        title_tag = book.find('h2', class_='title')
        author_tag = book.find('p', class_='author')
        price_tag = book.find('span', class_='price')
        link_tag = book.find('a')
        books.append({
            'title': title_tag.text if title_tag else 'N/A',
            'author': author_tag.text if author_tag else 'N/A',
            'price': price_tag.text if price_tag else 'N/A',
            'link': link_tag['href'] if link_tag else 'N/A',
        })
    return books


books_data = extract_book_data(sample_html)
print("Extracted Book Data:")
print("=" * 50)
for i, book in enumerate(books_data, 1):
    print(f"\nBook {i}:")
    print(f"  Title: {book['title']}")
    print(f"  Author: {book['author']}")
    print(f"  Price: {book['price']}")
    print(f"  Link: {book['link']}")

# ---------------------------------------------------------------------------
# Example 3: Working with real websites (with caution)
# ---------------------------------------------------------------------------
print("\n\n3. WORKING WITH REAL WEBSITES")
print("-" * 30)
print("Note: Always check robots.txt and terms of service!")
print("Let's use a public demo site instead of a real one.")

# Demo table standing in for a real site's product listing.
demo_html = '''
<table id="products">
    <tr><th>Product</th><th>Price</th><th>Stock</th></tr>
    <tr><td>Laptop</td><td>$999</td><td>In Stock</td></tr>
    <tr><td>Mouse</td><td>$25</td><td>Out of Stock</td></tr>
    <tr><td>Keyboard</td><td>$79</td><td>In Stock</td></tr>
</table>
'''

# Parse table data
table_soup = BeautifulSoup(demo_html, 'html.parser')
table = table_soup.find('table', id='products')

print("\nProduct Table:")
print("-" * 40)
if table:
    for row in table.find_all('tr'):
        row_data = [col.text.strip() for col in row.find_all(['th', 'td'])]
        if len(row_data) >= 3:  # guard against short/malformed rows
            print(f"{row_data[0]:15} {row_data[1]:10} {row_data[2]}")
else:
    print("Table not found")

# ---------------------------------------------------------------------------
# Example 4: Saving scraped data to CSV
# ---------------------------------------------------------------------------
print("\n\n4. SAVING DATA TO CSV")
print("-" * 30)

# Extract product data (skipping the header row).
products = []
if table:
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        if len(cols) >= 3:
            products.append({
                'name': cols[0].text.strip(),
                'price': cols[1].text.strip(),
                'stock': cols[2].text.strip(),
            })

# Save to CSV
filename = 'products.csv'
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['name', 'price', 'stock'])
    writer.writeheader()
    writer.writerows(products)

print(f"Saved {len(products)} products to {filename}")

print("\nCSV Content:")
with open(filename, 'r', encoding='utf-8') as f:
    print(f.read())

# ---------------------------------------------------------------------------
# Example 5: Advanced scraping techniques
# ---------------------------------------------------------------------------
print("\n\n5. ADVANCED SCRAPING TECHNIQUES")
print("-" * 30)

# Complex HTML with nested structure.
complex_html = '''
<article class="news-item featured">
    <h2><a href="/news/ai-breakthrough">AI Breakthrough</a></h2>
    <span class="date">2024-03-14</span>
    <span class="category">Science</span>
    <span class="views">2,345 views</span>
    <p class="summary">Researchers announce new AI model...</p>
    <ul>
        <li>AI</li>
        <li>Research</li>
        <li>Machine Learning</li>
    </ul>
</article>
'''

news_soup = BeautifulSoup(complex_html, 'html.parser')
print("News Articles:")
print("=" * 40)

for i, article in enumerate(news_soup.find_all('article', class_='news-item'), 1):
    title = article.find('h2').text.strip()
    link = article.find('a')['href']
    date_tag = article.find('span', class_='date')
    date = date_tag.text if date_tag else 'N/A'
    category_tag = article.find('span', class_='category')
    category = category_tag.text if category_tag else 'N/A'
    summary_tag = article.find('p', class_='summary')
    summary = summary_tag.text.strip() if summary_tag else 'N/A'
    # Extract tags
    tags = [tag.text for tag in article.find_all('li')]

    print(f"\nArticle {i}:")
    print(f"  Title: {title}")
    print(f"  Link: {link}")
    print(f"  Date: {date}")
    print(f"  Category: {category}")
    print(f"  Summary: {summary[:50]}...")
    print(f"  Tags: {', '.join(tags)}")
    # get('class') yields a list of CSS classes for the tag.
    if 'featured' in article.get('class', []):
        print("  ★ Featured Article")

# ---------------------------------------------------------------------------
# Example 6: Error handling in web scraping
# ---------------------------------------------------------------------------
print("\n\n6. ERROR HANDLING IN WEB SCRAPING")
print("-" * 30)


def safe_scrape(url):
    """Safely "scrape" a webpage with error handling.

    Randomly simulates success/404/timeout/parse-error outcomes instead of
    performing a real HTTP request.

    Args:
        url: the URL that would be fetched.

    Returns:
        A dict with a 'status' key ('success' or 'error') plus either
        'data' (on success) or 'message' (on error).
    """
    try:
        # In real scraping, you would use:
        # response = requests.get(url, headers={'User-Agent': 'Your Bot'})
        # response.raise_for_status()  # Check for HTTP errors

        import random  # local import: only needed for the simulation
        scenario = random.choice(
            ["success", "404_error", "timeout", "parse_error"])

        if scenario == "success":
            print(f"Successfully fetched {url}")
            # Parse would happen here
            return {"status": "success", "data": "Sample data"}
        elif scenario == "404_error":
            print(f"Error: Page not found (404) for {url}")
            return {"status": "error", "message": "Page not found"}
        elif scenario == "timeout":
            print(f"Error: Request timed out for {url}")
            return {"status": "error", "message": "Request timeout"}
        else:  # parse_error
            print(f"Error: Could not parse HTML from {url}")
            return {"status": "error", "message": "Parsing failed"}
    except Exception as e:
        print(f"Unexpected error: {e}")
        return {"status": "error", "message": str(e)}


# Test error handling
print("Testing error handling scenarios:")
for i in range(3):
    result = safe_scrape(f"https://example.com/page{i}")
    print(f"Result: {result['status']}")
    if result['status'] == "error":
        print(f"  Reason: {result['message']}")
    print()

# ---------------------------------------------------------------------------
# Example 7: Web scraping etiquette
# ---------------------------------------------------------------------------
print("\n\n7. WEB SCRAPING ETIQUETTE")
print("-" * 30)

print("Important rules for ethical web scraping:")
print("1. Check robots.txt (e.g., https://example.com/robots.txt)")
print("2. Respect rate limits (add delays between requests)")
print("3. Identify your bot with User-Agent header")
print("4. Don't overload servers")
print("5. Check website's terms of service")
print("6. Only scrape publicly available data")
print("7. Consider using official APIs if available")


def polite_scraper(urls):
    """Scrape multiple URLs with delays between requests.

    Args:
        urls: iterable of URL strings.

    Returns:
        A list of {'url': ..., 'data': ...} dicts (simulated responses).
    """
    scraped_data = []
    for i, url in enumerate(urls):
        print(f"Scraping {url}...")
        # Simulate request
        # response = requests.get(url, headers={
        #     'User-Agent': 'MyScraperBot/1.0 (educational-purpose)'
        # })

        # Add delay to be polite (2-5 seconds between requests)
        if i > 0:
            delay = 3  # seconds
            print(f"Waiting {delay} seconds to be polite...")
            # time.sleep(delay)

        # Process response...
        scraped_data.append({"url": url, "data": f"Data from {url}"})
    return scraped_data


# ---------------------------------------------------------------------------
# Example 8: Complete web scraping project
# ---------------------------------------------------------------------------
print("\n\n8. COMPLETE WEB SCRAPING PROJECT")
print("-" * 30)


class BookScraper:
    """A simple book scraper for demonstration.

    Collects book records (dicts) in ``self.books`` and can display,
    persist, and filter them.
    """

    def __init__(self):
        self.books = []  # accumulated book dicts

    def scrape_sample_data(self):
        """Scrape from sample HTML (in real life this would fetch a URL).

        Returns:
            The total number of books collected so far.
        """
        # Sample data representing a bookstore.
        html_content = '''
<div class="book">
    <h3>Python Cookbook</h3>
    <p class="author">David Beazley</p>
    <p class="price">$49.99</p>
    <p class="rating">★★★★☆ (4.2/5)</p>
</div>
<div class="book">
    <h3>Fluent Python</h3>
    <p class="author">Luciano Ramalho</p>
    <p class="price">$44.99</p>
    <p class="rating">★★★★★ (4.7/5)</p>
</div>
'''
        soup = BeautifulSoup(html_content, 'html.parser')
        for book_div in soup.find_all('div', class_='book'):
            # Single lookup per field; 'N/A' when a tag is absent.
            title_tag = book_div.find('h3')
            author_tag = book_div.find('p', class_='author')
            price_tag = book_div.find('p', class_='price')
            rating_tag = book_div.find('p', class_='rating')
            self.books.append({
                'title': title_tag.text if title_tag else 'N/A',
                'author': author_tag.text if author_tag else 'N/A',
                'price': price_tag.text if price_tag else 'N/A',
                'rating': rating_tag.text if rating_tag else 'N/A',
            })
        return len(self.books)

    def display_books(self):
        """Print every scraped book to stdout."""
        print(f"\nFound {len(self.books)} books:")
        print("=" * 50)
        for i, book in enumerate(self.books, 1):
            print(f"\nBook {i}:")
            print(f"  Title: {book['title']}")
            print(f"  Author: {book['author']}")
            print(f"  Price: {book['price']}")
            print(f"  Rating: {book['rating']}")

    def save_to_csv(self, filename):
        """Save the scraped books to a CSV file at *filename*."""
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(
                csvfile, fieldnames=['title', 'author', 'price', 'rating'])
            writer.writeheader()
            writer.writerows(self.books)
        print(f"\nSaved {len(self.books)} books to {filename}")

    def filter_by_price(self, max_price):
        """Return the books whose price is at most *max_price*.

        Prices are strings like "$49.99"; unparseable prices are skipped
        rather than raising.
        """
        affordable_books = []
        for book in self.books:
            try:
                price = float(book['price'].replace('$', '').strip())
            except (ValueError, AttributeError):
                continue
            if price <= max_price:
                affordable_books.append(book)
        return affordable_books


# Run the scraper
print("Running Book Scraper...")
scraper = BookScraper()
count = scraper.scrape_sample_data()
print(f"Scraped {count} books")
scraper.display_books()

# Save to CSV
scraper.save_to_csv('books.csv')

# Filter books
print("\nBooks under $45:")
affordable = scraper.filter_by_price(45)
for book in affordable:
    print(f"- {book['title']}: {book['price']}")