Web Scraping Animated Images with Python
Automatically collecting animated images from websites can be useful when manual downloading is cumbersome, especially when websites restrict right-click functionality. This guide demonstrates how to create a Python script to extract GIFs from online sources.
We'll be scraping images from "FunnyGIFs", a humor-focused website featuring animated content.
Approach
- Fetch the HTML content of the target webpage
- Parse the HTML to locate URLs of animated images
- Download these images to local storage
- Implement pagination to scrape multiple pages
Implementation
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.request
import time
import uuid
import os
import sys
import re
from bs4 import BeautifulSoup
def fetch_page(url):
    """Retrieve the raw HTML content of *url*.

    Parameters
    ----------
    url : str
        Fully-qualified URL to fetch.

    Returns
    -------
    bytes or None
        The response body, or None if the request failed for any reason.
    """
    try:
        print(f"Fetching: {url}")
        # The context manager guarantees the connection is closed even if
        # read() raises (the original never closed the response object).
        with urllib.request.urlopen(url) as response:
            return response.read()
    except Exception as e:
        # Broad catch is deliberate: any failure is reported to the user
        # and the caller receives None instead of an exception.
        print(f"Error fetching page: {e}")
        return None
def extract_image_urls(html):
    """Parse *html* and collect the URL and title of each animated image.

    Advertisement items (which lack a ``div.text`` child) are skipped, as
    are malformed items missing an ``img`` or ``h3`` tag — the original
    raised AttributeError on those and on error pages with no ``div.main``.

    Parameters
    ----------
    html : bytes or str or None
        Raw page markup as returned by fetch_page().

    Returns
    -------
    list[dict]
        One dict per image with keys 'url' and 'name'.
    """
    if not html:
        print('No content to process')
        return []
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find("div", {"class": "main"})
    if container is None:
        # Error page or changed layout: bail out instead of crashing.
        print('No content to process')
        return []
    image_list = []
    for item in container.find_all('div', {'class': 'item'}):
        # Advertisement blocks have no 'text' div — skip them.
        text_div = item.find('div', {"class": "text"})
        if text_div is None:
            continue
        img_tag = text_div.find('img')
        title_tag = item.find('h3')
        if img_tag is None or title_tag is None:
            # Malformed item; skip rather than abort the whole page.
            continue
        image_list.append({
            'url': img_tag.get('src'),
            'name': title_tag.text.strip(),
        })
    return image_list
def save_image(image_name, image_url, category, page_number):
    """Download *image_url* and save it under a dated directory tree.

    Files land in ``GIF_Collection/<Y-M-D>/<category>/<page_number>/`` and
    are named ``<image_name>_<uuid>.<ext>`` so duplicates never collide.

    Parameters
    ----------
    image_name : str
        Human-readable title used as the filename stem.
    image_url : str
        Direct URL of the image to download.
    category : str
        Subdirectory grouping (e.g. 'Funny_Animations').
    page_number : int or str
        Page the image came from; used as the final subdirectory.

    Returns
    -------
    str or None
        Path of the saved file, or None if the download failed.
    """
    # Directory structure keyed on today's local date.
    now = time.localtime()
    date_folder = f"{now.tm_year}-{now.tm_mon}-{now.tm_mday}"
    image_dir = f"GIF_Collection/{date_folder}/{category}/{page_number}"
    # uuid1 keeps same-named images from overwriting each other.
    unique_id = uuid.uuid1()
    # Strip any query string before taking the extension, otherwise
    # 'foo.gif?x=1' would yield the extension 'gif?x=1'.
    file_extension = image_url.split('?')[0].split('.')[-1]
    file_path = f"{image_dir}/{image_name}_{unique_id}.{file_extension}"
    os.makedirs(image_dir, exist_ok=True)
    print(f"Saving image to: {file_path}")
    try:
        urllib.request.urlretrieve(image_url, file_path)
    except Exception as e:
        # One dead link must not abort the whole page scrape
        # (the original let the exception propagate and kill the loop).
        print(f"Error downloading {image_url}: {e}")
        return None
    print(f"Source URL: {image_url}")
    return file_path
def exit_program():
    """Announce shutdown and terminate the process with exit status 0."""
    print("Exiting scraper...")
    raise SystemExit(0)
def scrape_page(page_number):
    """Fetch one listing page and download every image found on it.

    Parameters
    ----------
    page_number : int or str
        Page index substituted into the listing URL; also used as the
        destination subdirectory when saving.
    """
    url = f"http://www.funnygifs.com/animations/list_4_{page_number}.html"
    page_html = fetch_page(url)
    for entry in extract_image_urls(page_html):
        save_image(entry['name'], entry['url'], 'Funny_Animations', page_number)
if __name__ == '__main__':
    print("""
*****************************************
** GIF Web Scraper Program **
** Python Implementation **
*****************************************""")
    # Prompt for the starting page and validate it ('quit' exits cleanly).
    page_input = input("Enter page number to scrape (1-50), or 'quit' to exit: ")
    while not page_input.isdigit() or not (1 <= int(page_input) <= 50):
        if page_input.lower() == 'quit':
            exit_program()
        print("Invalid input. Please enter a number between 1-50.")
        page_input = input("Enter page number to scrape: ")
    start_page = int(page_input)
    # Scrape the initial page.
    scrape_page(start_page)
    # Prompt for how many further pages to scrape and validate.
    additional_pages = input("Enter number of additional pages to scrape (1-5000), or 'quit' to exit: ")
    while not additional_pages.isdigit() or not (1 <= int(additional_pages) <= 5000):
        if additional_pages.lower() == 'quit':
            exit_program()
        print("Invalid input. Please enter a number between 1-5000.")
        additional_pages = input("Enter number of additional pages to scrape: ")
    # Continue from the page AFTER the initial one. The original loop was
    # range(1, N + 1), which restarted at page 1 and re-downloaded every
    # page up to N — including the page just scraped above.
    for page_num in range(start_page + 1, start_page + int(additional_pages) + 1):
        scrape_page(page_num)
Sample Output
*****************************************
** GIF Web Scraper Program **
** Python Implementation **
*****************************************
Enter page number to scrape (1-50), or 'quit' to exit: 1
1
Fetching: http://www.funnygifs.com/animations/list_4_1.html
Saving image to: GIF_Collection/2023-5-10/Funny_Animations/1/Tough_choices_5f0fe8f6-09f8-11e7-9161-f8bc12753d1e.gif
Source URL: http://www.funnygifs.com/uploads/allimg/170206/10-1F206135ZHJ.gif
Saving image to: GIF_Collection/2023-5-10/Funny_Animations/1/That_will_end_badly_3fa9da88-09f8-11e7-9161-f8bc12753d1e.gif
Source URL: http://www.funnygifs.com/uploads/allimg/170206/10-1F206135H35U.gif
Saving image to: GIF_Collection/2023-5-10/Funny_Animations/1/Definitely_Indian_4064e60c-09f8-11e7-9161-f8bc12753d1e.gif
Source URL: http://www.funnygifs.com/uploads/allimg/170206/10-1F20613543c50.gif
Saving image to: GIF_Collection/2023-5-10/Funny_Animations/1/Genuine_work_face_414b4f52-09f8-11e7-9161-f8bc12753d1e.gif
Source URL: http://www.funnygifs.com/uploads/allimg/170206/10-1F206135250553.gif
Saving image to: GIF_Collection/2023-5-10/Funny_Animations/1/What_is_she_shaking_421afa86-09f8-11e7-9161-f8bc12753d1e.gif
Source URL: http://www.funnygifs.com/uploads/allimg/170206/10-1F20613493N03.gif
Enter number of additional pages to scrape (1-5000), or 'quit' to exit: quit
Exiting scraper...
After execution, the animated images will be saved in organized directories on your local system.