The core mechanism of an image scraper involves three steps: fetching a webpage, parsing its HTML to extract image URLs, and downloading each image file. Below are two practical examples—one for a general gallery site and another for a specific article. These scripts rely on Python's urllib.request and re modules and were originally written in 2020; the URLs may no longer be active.
1. Scraping images from a gallery page
import urllib.request
import re
class ImageScraper:
    """Download every gallery image linked from a page.

    Workflow: fetch the page with a custom User-Agent, pull image URLs
    out of the raw HTML with a byte-level regex, then save each image
    to the current directory as ``1.<ext>``, ``2.<ext>``, ...
    """

    # Gallery CDN pattern: 32-character hash followed by an image extension.
    # Compiled once at class level instead of on every extraction call.
    IMAGE_PATTERN = re.compile(
        rb"https://pit1\.maozhew\.com/forum/.{32}\.(?:jpg|jpeg|png)"
    )

    def __init__(self, base_url, user_agent):
        self.base_url = base_url
        self.user_agent = user_agent

    def fetch_page(self, url=None):
        """Return the raw bytes of *url* (defaults to ``self.base_url``).

        The optional *url* parameter replaces the original hack of
        overwriting ``self.base_url`` to re-use this method for images.
        """
        req = urllib.request.Request(url or self.base_url)
        req.add_header("User-Agent", self.user_agent)
        # Context manager closes the connection even if read() raises;
        # the original leaked the response object.
        with urllib.request.urlopen(req) as response:
            return response.read()

    def extract_image_urls(self, html=None):
        """Return the image URLs found in *html*.

        When *html* is omitted the page is fetched from ``self.base_url``
        (the original behavior); passing bytes directly allows offline use.
        """
        if html is None:
            html = self.fetch_page()
        return [match.decode("utf-8") for match in self.IMAGE_PATTERN.findall(html)]

    def download_images(self):
        """Download every extracted image to the working directory.

        BUG FIX: the original saved every file with a ``.jpg`` suffix even
        when the matched URL ended in ``.png`` or ``.jpeg``; the extension
        is now taken from the URL itself. ``self.base_url`` is no longer
        mutated, so the scraper can be reused after downloading.
        """
        for counter, img_url in enumerate(self.extract_image_urls(), start=1):
            extension = img_url.rsplit(".", 1)[-1]
            with open(f"{counter}.{extension}", "wb") as f:
                f.write(self.fetch_page(img_url))
if __name__ == "__main__":
    # Guarded entry point so importing this module does not trigger network
    # I/O (the original ran unconditionally at import time).
    # Identify as desktop Chrome: some hosts reject urllib's default UA.
    browser_ua = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    )
    scraper = ImageScraper("https://www.topit.pro/", browser_ua)
    scraper.download_images()
The code defines a class that bundles fetching, parsing, and downloading. The regular expression looks for image URLs matching the gallery's pattern.
2. Scraping images from a news article
import urllib.request
import re
class ImageScraper:
    """Download every image embedded in a news article.

    Workflow: fetch the article with a custom User-Agent, pull image URLs
    out of the raw HTML with a byte-level regex matching the article's
    CDN, then save each image as ``1.<ext>``, ``2.<ext>``, ...
    """

    # Sohu CDN pattern: 41-character path segment plus an image extension.
    # Compiled once at class level instead of on every extraction call.
    IMAGE_PATTERN = re.compile(
        rb"http://5b0988e595225\.cdn\.sohucs\.com/images/.{41}\.(?:jpg|jpeg|png)"
    )

    def __init__(self, base_url, user_agent):
        self.base_url = base_url
        self.user_agent = user_agent

    def fetch_page(self, url=None):
        """Return the raw bytes of *url* (defaults to ``self.base_url``).

        The optional *url* parameter replaces the original hack of
        overwriting ``self.base_url`` to re-use this method for images.
        """
        req = urllib.request.Request(url or self.base_url)
        req.add_header("User-Agent", self.user_agent)
        # Context manager closes the connection even if read() raises;
        # the original leaked the response object.
        with urllib.request.urlopen(req) as response:
            return response.read()

    def extract_image_urls(self, html=None):
        """Return the image URLs found in *html*.

        When *html* is omitted the page is fetched from ``self.base_url``
        (the original behavior); passing bytes directly allows offline use.
        """
        if html is None:
            html = self.fetch_page()
        return [match.decode("utf-8") for match in self.IMAGE_PATTERN.findall(html)]

    def download_images(self):
        """Download every extracted image to the working directory.

        BUG FIX: the original saved every file with a ``.jpg`` suffix even
        when the matched URL ended in ``.png`` or ``.jpeg``; the extension
        is now taken from the URL itself. ``self.base_url`` is no longer
        mutated, so the scraper can be reused after downloading.
        """
        for counter, img_url in enumerate(self.extract_image_urls(), start=1):
            extension = img_url.rsplit(".", 1)[-1]
            with open(f"{counter}.{extension}", "wb") as f:
                f.write(self.fetch_page(img_url))
if __name__ == "__main__":
    # Guarded entry point so importing this module does not trigger network
    # I/O (the original ran unconditionally at import time).
    # Identify as desktop Chrome: some hosts reject urllib's default UA.
    browser_ua = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    )
    scraper = ImageScraper("https://www.sohu.com/a/255373346_721493", browser_ua)
    scraper.download_images()
This version adjusts the regex pattern to match the article's image CDN. Both scripts follow the same structure: request the page, extract URLs with regex, and write each image to a local file.