Regular Expressions with the re Module
The re module provides pattern matching operations for string processing, often used for data extraction and validation.
import re

# Extract all numeric sequences from a string.
# re.findall returns every non-overlapping match as a list of strings.
number_list = re.findall(r'\d+', 'ID: 12345, Code: 67890')
print(number_list)

# Use an iterator for memory-efficient matching.
# re.finditer yields Match objects lazily instead of building a full list.
num_iter = re.finditer(r'\d+', 'ID: 12345, Code: 67890')
for match in num_iter:
    print(match.group())

# Search for the first occurrence anywhere in the string (None if absent).
first_match = re.search(r'\d+', 'ID: 12345, Code: 67890')
if first_match:
    print(first_match.group())

# re.match only succeeds when the pattern matches at the very start.
start_match = re.match(r'\d+', '12345, Code: 67890')
if start_match:
    print(start_match.group())

# Extract data using named groups: (?P<name>...) lets each field be
# retrieved by name via Match.group('name').
html_content = """
<div class='item'><span id='101'>Product A</span></div>
<div class='item'><span id='102'>Product B</span></div>
<div class='item'><span id='103'>Product C</span></div>
"""
# Compile once since the pattern is reused; re.S makes '.' match newlines.
pattern = re.compile(r"<div class='.*?'><span id='(?P<product_id>\d+)'>(?P<name>.*?)</span></div>", re.S)
matches = pattern.finditer(html_content)
for m in matches:
    print(f"Name: {m.group('name')}, ID: {m.group('product_id')}")
HTML Parsing with BeautifulSoup
BeautifulSoup constructs a parse tree from HTML/XML documents, enabling easy data navigation and extraction.
from bs4 import BeautifulSoup
import re

html_doc = """
<html>
<head><title>Sample Page</title></head>
<body>
<h1>Main Heading</h1>
<p class="intro">First paragraph with a <a href="/link1" class="internal">link</a>.</p>
<p class="content">Second paragraph with another <a href="/link2" id="special">link</a>.</p>
<ul>
<li>Item one</li>
<li>Item two</li>
</ul>
</body>
</html>
"""

# Build the parse tree with the lxml parser (fast; falls back to
# 'html.parser' if lxml is not installed).
soup = BeautifulSoup(html_doc, 'lxml')

# Access tag properties: the first matching tag is reachable as an attribute.
print(soup.title)          # the <title> tag itself
print(soup.title.string)   # its text content
print(soup.h1.name)        # the tag's name, 'h1'
print(soup.p['class'])     # attribute access; 'class' is multi-valued -> list

# Find elements by various criteria: find_all returns every matching tag.
all_links = soup.find_all('a')
for link in all_links:
    # .get returns None instead of raising when the attribute is missing.
    print(link.get('href'))

# Find by attribute (any tag whose id is 'special').
special_link = soup.find_all(id='special')
print(special_link)

# Use CSS selectors: select always returns a list of matches.
link_by_class = soup.select('.internal')   # by class
link_by_id = soup.select('#special')       # by id
nested_links = soup.select('p a')          # descendant combinator
XML and HTML Parsing with lxml and XPath
XPath provides a syntax for navigating through elements and attributes in an XML/HTML document tree.
from lxml import etree

xml_data = """
<catalog>
<product>
<sku>P001</sku>
<name>Widget</name>
<category>
<primary>Tools</primary>
<secondary>Hardware</secondary>
</category>
<suppliers>
<vendor id="v1">Supplier A</vendor>
<vendor id="v2">Supplier B</vendor>
</suppliers>
</product>
</catalog>
"""

# Parse XML from a string; returns the root Element (<catalog>).
tree = etree.XML(xml_data)

# Navigate the hierarchy with an absolute path; text() selects the
# text nodes, so xpath() returns a list of strings.
product_name = tree.xpath('/catalog/product/name/text()')
print(product_name)

# '//' searches the whole tree regardless of depth.
all_vendors = tree.xpath('//vendor/text()')
print(all_vendors)

# '@id' selects attribute values rather than elements.
vendor_ids = tree.xpath('//vendor/@id')
print(vendor_ids)

# A predicate ([@id='v2']) filters by attribute value.
specific_vendor = tree.xpath("//vendor[@id='v2']/text()")
print(specific_vendor)

# Parse from an HTML file; HTMLParser tolerates malformed markup.
# NOTE: assumes 'example.html' exists in the working directory.
html_tree = etree.parse('example.html', etree.HTMLParser())

# Positional indexing is 1-based in XPath: li[1] is the FIRST list item.
first_item = html_tree.xpath('/html/body/ul/li[1]/text()')

# Iterate over result sets; a path starting with './' is evaluated
# relative to the current element.
list_items = html_tree.xpath('/html/body/ul/li')
for item in list_items:
    text = item.xpath('./text()')
    print(text)