Advanced Web Scraping Techniques for Rankings, Products, and Images

Extracting Structured Ranking Data

Utilizing requests alongside BeautifulSoup enables efficient extraction of tabular data from static web pages. The following implementation targets university ranking lists, parsing specific DOM elements to compile rank, institution name, location, type, and score.

import requests
from bs4 import BeautifulSoup

def _extract_cells(parser, selector, part_index=1, strip=True):
    """Collect one text fragment from every element matching *selector*.

    Each matched cell's text is newline-separated; *part_index* selects
    which fragment carries the value (the ranking table puts the value on
    the second line of most cells, hence the default of 1). Cells that do
    not have enough fragments are skipped instead of raising IndexError.

    Args:
        parser: a BeautifulSoup document.
        selector: CSS selector for the target cells.
        part_index: index into the newline-split cell text.
        strip: whether to strip surrounding whitespace from the fragment.

    Returns:
        List of extracted text fragments, in document order.
    """
    values = []
    for element in parser.select(selector):
        parts = element.get_text().split('\n')
        if part_index < len(parts):
            text = parts[part_index]
            values.append(text.strip() if strip else text)
    return values

def fetch_university_rankings(target_url):
    """Fetch a university ranking page and print the first 30 rows.

    Scrapes rank, institution name, location, type, and score from the
    ranking table (selectors are tied to the site's Vue-generated
    ``data-v-389300f0`` attributes) and prints them as aligned columns.

    Args:
        target_url: URL of the ranking page to scrape.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP response.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0'
    }

    # A timeout keeps the scraper from hanging indefinitely on a dead host.
    response = requests.get(target_url, headers=request_headers, timeout=30)
    response.raise_for_status()
    response.encoding = "utf-8"

    parser = BeautifulSoup(response.text, 'lxml')

    row = 'tr[data-v-389300f0]'
    rankings = _extract_cells(parser, f'{row} td[data-v-389300f0]:nth-child(1)')
    # Institution names live in a dedicated span; the value is on the first
    # line of the cell text and is used verbatim (no strip), as before.
    institutions = _extract_cells(parser, f'{row} span[class="name-cn"]',
                                  part_index=0, strip=False)
    locations = _extract_cells(parser, f'{row} td[data-v-389300f0]:nth-child(3)')
    types = _extract_cells(parser, f'{row} td[data-v-389300f0]:nth-child(4)')
    scores = _extract_cells(parser, f'{row} td[data-v-389300f0]:nth-child(5)')

    print(f'{"Rank":<6}{"Name":<15}{"Location":<10}{"Type":<10}{"Score":<10}')
    for i in range(min(30, len(rankings))):
        print(f'{rankings[i]:<6}{institutions[i]:<15}{locations[i]:<10}{types[i]:<10}{scores[i]:<10}')

if __name__ == "__main__":
    source_url = 'http://www.shanghairanking.cn/rankings/bcur/2020'
    fetch_university_rankings(source_url)

Pattern Matching for Product Pricing

When dealing with e-commerce search results, regular expressions can isolate specific data points within raw HTML strings. This approach demonstrates extracting product titles and prices from a search result page by identifying unique markers in the source code.

import requests
import re

# Markers delimiting the product list and its per-item fields; compiled once
# at import time rather than on every loop iteration.
_LIST_RE = re.compile(r'<ul class="bigimg cloth_shoplist" id="component_38">.*?</ul>', re.DOTALL)
_TITLE_OPEN_RE = re.compile(r'<a title=" ')
_TITLE_CLOSE_RE = re.compile(r'  ddclick=')
_PRICE_OPEN_RE = re.compile(r'<span class="price_n">')
_PRICE_CLOSE_RE = re.compile(r'</span>')
_ITEM_END_RE = re.compile(r'</li>')

def scrape_product_prices(search_url):
    """Scrape product titles and prices from a Dangdang search-result page.

    Locates the result list container in the raw HTML, then repeatedly
    slices out the title and price of the leading item and truncates the
    processed ``<li>`` before scanning for the next one. Results are
    printed as an indexed table; nothing is returned.

    Args:
        search_url: URL of the search-result page to scrape.
    """
    try:
        header_info = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
        }
        # Timeout prevents an unresponsive server from hanging the scraper.
        resp = requests.get(search_url, headers=header_info, timeout=30)
        raw_html = resp.text

        match = _LIST_RE.search(raw_html)
        if not match:
            return

        list_segment = match.group(0)
        print(f'{"ID":<5}{"Price":<10}{"Product Name"}')
        item_index = 1

        while True:
            title_open = _TITLE_OPEN_RE.search(list_segment)
            if not title_open:
                break
            title_close = _TITLE_CLOSE_RE.search(list_segment)
            price_open = _PRICE_OPEN_RE.search(list_segment)
            price_close = _PRICE_CLOSE_RE.search(list_segment)
            item_end = _ITEM_END_RE.search(list_segment)
            if not (title_close and price_open and price_close and item_end):
                # Malformed markup: stop cleanly instead of raising
                # AttributeError on a None match.
                break

            # Offsets mirror the site's markup: the title ends one char
            # before the ddclick marker; the first 5 chars after the price
            # span opening are skipped (presumably a currency entity —
            # TODO confirm against live markup).
            product_title = list_segment[title_open.end():title_close.start() - 1]
            product_price = list_segment[price_open.end() + 5:price_close.start()]

            # Drop the processed <li> so the next scan sees the next item.
            list_segment = list_segment[item_end.end():]

            if product_price and product_title:
                print(f"{item_index:<5}{product_price:<10}{product_title}")
            else:
                print(f"{item_index:<5}Data Missing")
            item_index += 1
    except Exception as e:
        # Top-level boundary: report and swallow so a scrape failure does
        # not crash the caller.
        print(f"Error occurred: {e}")

if __name__ == "__main__":
    target = 'https://search.dangdang.com/?key=%CA%E9%B0%FC&act=input&category_id=4003728&type=4003728&att=1000012%3A1873#J_tab'
    scrape_product_prices(target)

Concurrent Image Asset Retrieval

Downloading multiple media files benefits from concurrent execution. The following script initializes a local directory, traverses pagination links, identifies image sources matching specific extensions, and manages threads to handle downloads simultaneously.

import os
import threading
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

class MediaDownloader:
    """Crawl paginated gallery pages and download matching images concurrently.

    Starting from ``base_url``, each page is parsed for image containers;
    every image with an accepted extension is fetched on its own thread
    while the crawler follows "next page" links recursively. Results land
    in a local ``download/`` directory which is emptied on each run.
    """

    # Only these extensions (case-insensitive) are downloaded.
    IMAGE_EXTENSIONS = frozenset({"jpg", "png", "jpeg"})

    def __init__(self, base_url):
        self.base_url = base_url
        self.page_count = 0        # pages visited so far
        self.item_count = 0        # images scheduled for download
        self.active_threads = []   # download threads to join in run()
        self.visited_urls = set()  # guards against circular pagination links
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0'
        }
        self.output_dir = 'download'

    def setup_directory(self):
        """Create the output directory, or empty it if it already exists."""
        if not os.path.exists(self.output_dir):
            os.mkdir(self.output_dir)
        else:
            for file_name in os.listdir(self.output_dir):
                os.remove(os.path.join(self.output_dir, file_name))

    def save_image(self, file_id, source_url, extension):
        """Download a single image to ``<output_dir>/<file_id>.<extension>``.

        Runs on a worker thread; failures are printed rather than raised so
        one bad download never aborts the crawl.
        """
        try:
            file_path = os.path.join(self.output_dir, f'{file_id}.{extension}')
            urllib.request.urlretrieve(source_url, file_path)
        except Exception as e:
            print(e)

    def crawl(self, url):
        """Scrape *url* for images, spawn download threads, recurse into pagination.

        Each URL is crawled at most once: without the visited-set guard a
        circular "next" link would recurse indefinitely.
        """
        if url in self.visited_urls:
            return
        self.visited_urls.add(url)

        self.page_count += 1
        print(f'Page {self.page_count}: {url}')
        try:
            req = urllib.request.Request(url, headers=self.headers)
            resp = urllib.request.urlopen(req)
            html = resp.read().decode()
            parser = BeautifulSoup(html, 'lxml')

            containers = parser.select("div[class='n_right fr'] div[class='img slow']")
            for container in containers:
                img_tag = container.select_one('img')
                if not img_tag:
                    continue
                src = urllib.parse.urljoin(url, img_tag['src'])
                # splitext only looks at the last path component, unlike a
                # raw rfind('.') over the whole URL.
                ext = os.path.splitext(src)[1].lstrip('.')

                if ext.lower() in self.IMAGE_EXTENSIONS:
                    self.item_count += 1
                    file_id = f'{self.item_count:06d}'
                    thread = threading.Thread(target=self.save_image,
                                              args=(file_id, src, ext))
                    thread.start()
                    self.active_threads.append(thread)

            # Follow pagination links, resolving relative hrefs against
            # the current page URL.
            next_links = parser.select("div[class='pb_sys_common pb_sys_normal pb_sys_style2'] span[class='p_next p_fun'] a")
            for link in next_links:
                next_url = urllib.parse.urljoin(url, link['href'])
                if next_url:
                    self.crawl(next_url)
        except Exception as e:
            # Boundary handler: report the page failure and keep going.
            print(e)

    def run(self):
        """Entry point: prepare the directory, crawl, and wait for downloads."""
        self.setup_directory()
        self.crawl(self.base_url)
        for t in self.active_threads:
            t.join()
        print(f'Total {self.page_count} pages, {self.item_count} items')

if __name__ == "__main__":
    starter_url = 'https://news.fzu.edu.cn/yxfd.htm'
    downloader = MediaDownloader(starter_url)
    downloader.run()

Tags: python web-scraping beautifulsoup regex Concurrency

Posted on Sat, 16 May 2026 07:09:57 +0000 by sarathi