Extracting Structured Ranking Data
Utilizing requests alongside BeautifulSoup enables efficient extraction of tabular data from static web pages. The following implementation targets university ranking lists, parsing specific DOM elements to compile rank, institution name, location, type, and score.
import requests
from bs4 import BeautifulSoup
def fetch_university_rankings(target_url):
request_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0'
}
response = requests.get(target_url, headers=request_headers)
response.encoding = "utf-8"
html_content = response.text
parser = BeautifulSoup(html_content, 'lxml')
rankings = []
institutions = []
locations = []
types = []
scores = []
# Extract ranking positions
rank_elements = parser.select('tr[data-v-389300f0] td[data-v-389300f0]:nth-child(1)')
for element in rank_elements:
clean_text = element.get_text().split('\n')[1].strip()
rankings.append(clean_text)
# Extract institution names
name_elements = parser.select('tr[data-v-389300f0] span[class="name-cn"]')
for element in name_elements:
clean_text = element.get_text().split('\n')[0]
institutions.append(clean_text)
# Extract provinces
loc_elements = parser.select('tr[data-v-389300f0] td[data-v-389300f0]:nth-child(3)')
for element in loc_elements:
clean_text = element.get_text().split('\n')[1].strip()
locations.append(clean_text)
# Extract types
type_elements = parser.select('tr[data-v-389300f0] td[data-v-389300f0]:nth-child(4)')
for element in type_elements:
clean_text = element.get_text().split('\n')[1].strip()
types.append(clean_text)
# Extract scores
score_elements = parser.select('tr[data-v-389300f0] td[data-v-389300f0]:nth-child(5)')
for element in score_elements:
clean_text = element.get_text().split('\n')[1].strip()
scores.append(clean_text)
print(f'{"Rank":<6}{"Name":<15}{"Location":<10}{"Type":<10}{"Score":<10}')
for i in range(min(30, len(rankings))):
print(f'{rankings[i]:<6}{institutions[i]:<15}{locations[i]:<10}{types[i]:<10}{scores[i]:<10}')
if __name__ == "__main__":
    # Entry point: scrape and print the 2020 BCUR ranking table.
    ranking_page = 'http://www.shanghairanking.cn/rankings/bcur/2020'
    fetch_university_rankings(ranking_page)
Pattern Matching for Product Pricing
When dealing with e-commerce search results, regular expressions can isolate specific data points within raw HTML strings. This approach demonstrates extracting product titles and prices from a search result page by identifying unique markers in the source code.
import requests
import re
def scrape_product_prices(search_url):
try:
header_info = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
}
resp = requests.get(search_url, headers=header_info)
raw_html = resp.text
# Locate the specific list container
pattern = r'<ul class="bigimg cloth_shoplist" id="component_38">.*?</ul>'
match = re.search(pattern, raw_html, re.DOTALL)
if match:
list_segment = match.group(0)
print(f'{"ID":<5}{"Price":<10}{"Product Name"}')
item_index = 1
while re.search(r'<a title=" ', list_segment):
# Extract Title
title_start = re.search(r'<a title=" ', list_segment).end()
title_end = re.search(r' ddclick=', list_segment).start()
product_title = list_segment[title_start:title_end - 1]
# Extract Price
price_start = re.search(r'<span class="price_n">', list_segment).end()
price_end = re.search(r'</span>', list_segment).start()
product_price = list_segment[price_start + 5:price_end]
# Truncate processed segment
list_segment = list_segment[re.search('</li>', list_segment).end():]
if product_price and product_title:
print(f"{item_index:<5}{product_price:<10}{product_title}")
item_index += 1
else:
print(f"{item_index:<5}Data Missing")
item_index += 1
except Exception as e:
print(f"Error occurred: {e}")
if __name__ == "__main__":
    # Entry point: scrape a dangdang search-results page for bag prices.
    search_page = 'https://search.dangdang.com/?key=%CA%E9%B0%FC&act=input&category_id=4003728&type=4003728&att=1000012%3A1873#J_tab'
    scrape_product_prices(search_page)
Concurrent Image Asset Retrieval
Downloading multiple media files benefits from concurrent execution. The following script initializes a local directory, traverses pagination links, identifies image sources matching specific extensions, and manages threads to handle downloads simultaneously.
import os
import threading
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
class MediaDownloader:
    """Crawl a news listing page, following its "next page" links, and
    download every jpg/png/jpeg image found, one thread per file."""

    def __init__(self, base_url):
        self.base_url = base_url
        self.page_count = 0        # pages visited so far
        self.item_count = 0        # images scheduled for download
        self.active_threads = []   # worker threads joined in run()
        self.visited = set()       # URLs already crawled; guards against pagination cycles
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0'
        }
        self.output_dir = 'download'

    def setup_directory(self):
        """Create the output directory, or empty it if it already exists."""
        if not os.path.exists(self.output_dir):
            # makedirs also creates missing parents, unlike os.mkdir
            os.makedirs(self.output_dir)
        else:
            for file in os.listdir(self.output_dir):
                os.remove(os.path.join(self.output_dir, file))

    def save_image(self, file_id, source_url, extension):
        """Download one image to ``<output_dir>/<file_id>.<extension>``.

        Best-effort: failures are printed, never raised, so one bad URL
        does not kill its worker thread.
        """
        try:
            file_path = os.path.join(self.output_dir, f'{file_id}.{extension}')
            urllib.request.urlretrieve(source_url, file_path)
        except Exception as e:
            print(e)

    def crawl(self, url):
        """Visit *url*, spawn a download thread per matching image, then
        recurse into any "next page" links.

        A visited-set check prevents infinite recursion when pagination
        links form a cycle (the original had no such guard).
        """
        if url in self.visited:
            return
        self.visited.add(url)
        self.page_count += 1
        print(f'Page {self.page_count}: {url}')
        try:
            req = urllib.request.Request(url, headers=self.headers)
            # close the response deterministically instead of leaking it
            with urllib.request.urlopen(req) as resp:
                html = resp.read().decode()
            parser = BeautifulSoup(html, 'lxml')
            containers = parser.select("div[class='n_right fr'] div[class='img slow']")
            for container in containers:
                img_tag = container.select_one('img')
                # .get() guards against an <img> with no src attribute,
                # which would otherwise abort the whole page via KeyError
                if img_tag and img_tag.get('src'):
                    # urljoin belongs to urllib.parse; urllib.request only
                    # re-exports it as an undocumented implementation detail
                    src = urllib.parse.urljoin(url, img_tag['src'])
                    dot = src.rfind('.')
                    ext = src[dot + 1:] if dot >= 0 else ''
                    if ext.lower() in ("jpg", "png", "jpeg"):
                        self.item_count += 1
                        file_id = f'{self.item_count:06d}'
                        thread = threading.Thread(target=self.save_image,
                                                  args=(file_id, src, ext))
                        thread.start()
                        self.active_threads.append(thread)
            # Handle pagination: follow each "next" link recursively
            next_links = parser.select("div[class='pb_sys_common pb_sys_normal pb_sys_style2'] span[class='p_next p_fun'] a")
            for link in next_links:
                next_url = urllib.parse.urljoin(url, link['href'])
                if next_url:
                    self.crawl(next_url)
        except Exception as e:
            print(e)

    def run(self):
        """Prepare the output directory, crawl from the base URL, then
        wait for every download thread before printing the summary."""
        self.setup_directory()
        self.crawl(self.base_url)
        for t in self.active_threads:
            t.join()
        print(f'Total {self.page_count} pages, {self.item_count} items')
if __name__ == "__main__":
    # Entry point: crawl the campus news media page and fetch its images.
    MediaDownloader('https://news.fzu.edu.cn/yxfd.htm').run()