Parsing HTML and XML Data in Python with re, BeautifulSoup, and lxml

Regular Expressions with the re Module

The re module provides pattern matching operations for string processing, often used for data extraction and validation.

import re

# One sample string reused by the extraction examples below.
sample = 'ID: 12345, Code: 67890'

# re.findall: collect every non-overlapping match into a list of strings.
number_list = re.findall(r'\d+', sample)
print(number_list)

# re.finditer: lazily yield Match objects instead of building a list.
num_iter = re.finditer(r'\d+', sample)
for match in num_iter:
    print(match.group())

# re.search: scan the whole string and return the first Match (or None).
first_match = re.search(r'\d+', sample)
if first_match is not None:
    print(first_match.group())

# re.match: succeeds only when the pattern matches at position 0.
start_match = re.match(r'\d+', '12345, Code: 67890')
if start_match is not None:
    print(start_match.group())

# Named groups (?P<...>) let captured data be read by name, not index.
html_content = """
<div class='item'><span id='101'>Product A</span></div>
<div class='item'><span id='102'>Product B</span></div>
<div class='item'><span id='103'>Product C</span></div>
"""
pattern = re.compile(r"<div class='.*?'><span id='(?P<product_id>\d+)'>(?P<name>.*?)</span></div>", re.S)
for m in pattern.finditer(html_content):
    print(f"Name: {m.group('name')}, ID: {m.group('product_id')}")

HTML Parsing with BeautifulSoup

BeautifulSoup constructs a parse tree from HTML/XML documents, enabling easy data navigation and extraction.

from bs4 import BeautifulSoup
import re

# Minimal document exercised by the navigation examples below.
html_doc = """
<html>
<head><title>Sample Page</title></head>
<body>
<h1>Main Heading</h1>
<p class="intro">First paragraph with a <a href="/link1" class="internal">link</a>.</p>
<p class="content">Second paragraph with another <a href="/link2" id="special">link</a>.</p>
<ul>
<li>Item one</li>
<li>Item two</li>
</ul>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# Dotted access returns the first tag of that name in the document.
print(soup.title)          # the whole <title> element
print(soup.title.string)   # its text content
print(soup.h1.name)        # the tag's own name
print(soup.p['class'])     # attribute lookup on the first <p>

# find_all collects every matching tag; .get reads an attribute safely.
for link in soup.find_all('a'):
    print(link.get('href'))

# Keyword arguments to find_all filter on attribute values.
special_link = soup.find_all(id='special')
print(special_link)

# select() takes CSS selectors: class, id, and descendant combinators.
link_by_class = soup.select('.internal')
link_by_id = soup.select('#special')
nested_links = soup.select('p a')

XML and HTML Parsing with lxml and XPath

XPath provides a syntax for navigating through elements and attributes in an XML/HTML document tree.

from lxml import etree

# Small product catalogue queried by the XPath examples below.
xml_data = """
<catalog>
    <product>
        <sku>P001</sku>
        <name>Widget</name>
        <category>
            <primary>Tools</primary>
            <secondary>Hardware</secondary>
        </category>
        <suppliers>
            <vendor id="v1">Supplier A</vendor>
            <vendor id="v2">Supplier B</vendor>
        </suppliers>
    </product>
</catalog>
"""

# Build an element tree straight from the XML string.
tree = etree.XML(xml_data)

# An absolute path walks the hierarchy down from the root element.
product_name = tree.xpath('/catalog/product/name/text()')
print(product_name)

# '//' matches nodes anywhere in the document, at any depth.
all_vendors = tree.xpath('//vendor/text()')
print(all_vendors)

# '@name' selects attribute values rather than element text.
vendor_ids = tree.xpath('//vendor/@id')
print(vendor_ids)

# A predicate in [...] restricts matches by attribute value.
specific_vendor = tree.xpath("//vendor[@id='v2']/text()")
print(specific_vendor)

# HTMLParser tolerates real-world markup when parsing from a file.
html_tree = etree.parse('example.html', etree.HTMLParser())

# XPath positions are 1-based: li[1] is the first list item.
first_item = html_tree.xpath('/html/body/ul/li[1]/text()')

# Every element in a result set supports relative queries of its own.
for item in html_tree.xpath('/html/body/ul/li'):
    text = item.xpath('./text()')
    print(text)

Tags: python web scraping regular expressions HTML Parsing beautifulsoup

Posted on Thu, 14 May 2026 21:47:22 +0000 by jjfletch