Automated GUI Slider Captcha Resolution Using Computer Vision and UI Event Simulation

The core pipeline for resolving slider-based graphical verification challenges relies on capturing a static interface region, isolating the draggable element from the background track, extracting structural features through edge detection, and correlating spatial offsets via template matching. Final displacement values must be calibrated against platform-specific rendering offsets.

Core Processing Workflow

  1. Region Acquisition: Capture a bounded screen area containing both the puzzle piece and the target groove.
  2. ROI Segmentation: Split the grayscale frame into two subsets: the foreground fragment and the residual background matrix.
  3. Feature Extraction: Apply Gaussian smoothing to suppress texture noise, followed by Canny thresholding to isolate high-contrast boundaries.
  4. Spatial Correlation: Execute normalized cross-correlation (TM_CCOEFF_NORMED) to locate the optimal alignment coordinate.
  5. Input Simulation: Translate the calculated offset into mouse events, incorporating micro-adjustments to mimic human interaction patterns.

Reference Implementation (Basic)

import pyautogui as pg
import cv2
import time

# Calibration constant for specific DOM layouts
LAYOUT_OFFSET = 10

def execute_captcha_solve():
    """Locate a slider-captcha groove via edge template matching and drag to it.

    Captures a fixed screen region, splits it into the puzzle-piece strip and
    the background track, matches the piece's edge map against the track, then
    simulates a human-like drag over the computed distance.

    Returns:
        int: the horizontal drag distance in pixels (match offset plus the
        LAYOUT_OFFSET calibration constant).
    """
    time.sleep(3)  # Give the user time to focus the target window.
    # Define capture boundary: (x, y, width, height)
    capture_rect = (790, 391, 320, 200)
    region_img = pg.screenshot(region=capture_rect)
    region_img.save("capture_region.png")

    # Load as monochrome for consistent edge analysis
    raw_frame = cv2.imread("capture_region.png", cv2.IMREAD_GRAYSCALE)
    h, w = raw_frame.shape[:2]

    # Segment puzzle piece (left strip) and background track (remainder).
    # NOTE(review): piece_width=50 assumes the piece occupies the leftmost
    # 50 px of the capture — confirm against the actual widget layout.
    piece_width = 50
    piece_roi = raw_frame[0:h, 0:piece_width]
    bg_roi = raw_frame[0:h, piece_width:w]

    # Preprocessing reduces high-frequency noise before edge detection.
    blurred_piece = cv2.GaussianBlur(piece_roi, (5, 5), 0)
    blurred_bg = cv2.GaussianBlur(bg_roi, (5, 5), 0)

    # Boundary highlighting
    piece_edges = cv2.Canny(blurred_piece, 100, 200)
    bg_edges = cv2.Canny(blurred_bg, 100, 200)

    # Template correlation to find overlap position
    match_result = cv2.matchTemplate(bg_edges, piece_edges, cv2.TM_CCOEFF_NORMED)
    _, max_confidence, _, max_location = cv2.minMaxLoc(match_result)

    # max_location is relative to bg_roi; shift by piece_width to express
    # the displacement from the frame's left edge.
    calculated_offset = max_location[0] + piece_width
    actual_drag_distance = calculated_offset + LAYOUT_OFFSET

    # Simulate precise mouse interaction
    target_button_pos = (821, 633)
    pg.moveTo(target_button_pos, duration=0.3)
    time.sleep(0.1)
    pg.mouseDown()
    pg.moveRel(actual_drag_distance, 0, duration=0.3, tween=pg.easeInOutQuad)
    pg.moveRel(-5, 0, duration=0.1, tween=pg.easeInOutQuad)  # Micro-adjustment
    pg.moveRel(5, 0, duration=0.1, tween=pg.easeInOutQuad)
    pg.mouseUp()

    # Visualization: draw on a BGR copy so the red box is actually visible
    # (a color tuple applied to a grayscale image collapses to a single
    # intensity), and place the box at the matched groove in full-frame
    # coordinates — the original drew it piece_width too far to the left.
    debug_img = cv2.cvtColor(raw_frame, cv2.COLOR_GRAY2BGR)
    cv2.rectangle(debug_img, (calculated_offset, 0),
                  (calculated_offset + piece_width, h), (0, 0, 255), 1)
    cv2.imshow("Processed Regions", debug_img)
    cv2.waitKey(0)
    return actual_drag_distance

if __name__ == "__main__":
    # Run the solver only when executed as a script, not on import.
    execute_captcha_solve()

Auxiliary Utilities

Coordinate Mapping Helper Captures active cursor positions after a brief initialization delay.

import pyautogui as pg
import time

# Coordinate mapping helper: gives the user five seconds to hover the
# cursor over the point of interest, then reports its screen position.
time.sleep(5)
pos = pg.position()
x, y = pos[0], pos[1]
print(f"Target coordinates: x={x}, y={y}")

Drag Simulation Routine Standardized motion control with deceleration easing and release handling.

import pyautogui as pg
import time

# Fixed horizontal travel distance in pixels.
TARGET_DISTANCE = 192

# Arming delay so the target window can be focused before input starts.
time.sleep(3)
# Grab the slider handle at its known screen position.
pg.moveTo(821, 633, duration=0.3)
time.sleep(0.1)
pg.mouseDown()
# Main travel followed by a small backward correction, then release.
for dx, dt in ((TARGET_DISTANCE, 0.3), (-5, 0.1)):
    pg.moveRel(dx, 0, duration=dt, tween=pg.easeInOutQuad)
pg.mouseUp()

Interactive Region Selector Records two click points triggered by a keyboard shortcut, computes bounding dimensions, and exports the ROI.

import pyautogui
import cv2
from pynput.keyboard import Key, Listener as KListener, Controller as KController
from pynput.mouse import Button, Listener as MListener, Controller as MController

# Releasing left Ctrl ends the arming phase and starts click capture.
TRIGGER_KEY = Key.ctrl_l
# Flat list filled as [x1, y1, x2, y2] by the mouse click handler.
recorded_coords = []

def handle_mouse_click(x, y, button, pressed):
    """Record the position of each click press; stop after two clicks.

    Bug fix: the original returned False on the first button *release*,
    which terminated the listener after a single click even though the
    caller unpacks four coordinates (two points). The listener now stops
    only once both points have been captured.
    """
    if pressed:
        recorded_coords.extend([x, y])
    # Returning False from a pynput callback stops the listener.
    if len(recorded_coords) >= 4:
        return False

def handle_key_release(key):
    """Stop the keyboard listener once the trigger key is released."""
    # pynput terminates a listener when its callback returns False.
    return False if key == TRIGGER_KEY else None

if __name__ == "__main__":
    # Phase 1: block until the trigger key is released so the user can
    # line up the target window first.
    with KListener(on_release=handle_key_release) as k_listener:
        k_listener.join()

    print("Awaiting two click points...")
    # Phase 2: collect two click positions (opposite corners of the region).
    with MListener(on_click=handle_mouse_click) as m_listener:
        m_listener.join()

    x1, y1, x2, y2 = recorded_coords
    # Reject degenerate selections (near-zero width or height).
    if abs(x1 - x2) < 2 or abs(y1 - y2) < 2:
        raise ValueError("Invalid selection region.")

    # Normalize the two corners into a top-left anchor plus dimensions.
    # pynput may report float coordinates on some platforms, while
    # pyautogui.screenshot expects an integer region — cast explicitly.
    start_x = int(min(x1, x2))
    start_y = int(min(y1, y2))
    cap_w = int(abs(x1 - x2))
    cap_h = int(abs(y1 - y2))

    snapshot = pyautogui.screenshot(region=(start_x, start_y, cap_w, cap_h))
    snapshot.save("selected_area.jpg")

Optimized Implementation (Advanced)

This version introduces multi-threshold evaluation loops and a statistical fallback mechanism to maintain robustness across varying image quality levels. Processing latency typically falls below 25ms.

"""Multi-threshold slider-offset estimation with a frequency-based fallback.

Runs Canny + template matching under several threshold presets, keeps the
offset from the highest-scoring preset, and — when even the best score is
weak — falls back to the offset most presets agreed on.
"""
import cv2
import time
from collections import Counter

START_TIME = time.time()
raw_frame = cv2.imread("capture_region.png", cv2.IMREAD_GRAYSCALE)
h, w = raw_frame.shape[:2]

# Configuration presets: (lower_threshold, upper_threshold) for Canny.
THRESHOLD_PRESETS = [
    (200, 400), (150, 250), (100, 200),  # Standard
    (10, 150), (80, 150), (10, 80),      # Low contrast
    (10, 30)                             # Ultra-low contrast
]

# Left strip holds the draggable piece; the remainder is the track/groove.
piece_width = 50
piece_roi = raw_frame[0:h, 0:piece_width]
bg_roi = raw_frame[0:h, piece_width:w]

# Light smoothing suppresses texture noise before edge extraction.
blurred_piece = cv2.GaussianBlur(piece_roi, (5, 5), 1)
blurred_bg = cv2.GaussianBlur(bg_roi, (5, 5), 1)

confidence_scores = []
candidate_offsets = []

for lower, upper in THRESHOLD_PRESETS:
    piece_edge = cv2.Canny(blurred_piece, lower, upper)
    bg_edge = cv2.Canny(blurred_bg, lower, upper)

    # Normalized cross-correlation of the edge maps.
    match_data = cv2.matchTemplate(bg_edge, piece_edge, cv2.TM_CCOEFF_NORMED)
    _, conf, _, loc = cv2.minMaxLoc(match_data)

    confidence_scores.append(conf)
    # loc is relative to bg_roi; shift by piece_width for full-frame coords.
    # (The original also tracked a `best_match_coord` here that was never
    # read — dead code, removed.)
    candidate_offsets.append(loc[0] + piece_width)

# Primary result: the offset produced by the highest-confidence preset.
primary_conf = max(confidence_scores)
primary_idx = confidence_scores.index(primary_conf)
final_offset = candidate_offsets[primary_idx]

# Fallback aggregation when correlation drops below threshold: take the
# offset the presets suggested most frequently.
if primary_conf < 0.15:
    coord_freq = Counter(candidate_offsets)
    final_offset = coord_freq.most_common(1)[0][0]

elapsed_ms = round((time.time() - START_TIME) * 1000, 2)
print(f"Peak Confidence: {primary_conf:.4f} | Final Offset: {final_offset} | Latency: {elapsed_ms}ms")

# Debug visualization: draw on a BGR copy so the red box is visible (a color
# tuple on a grayscale image collapses to one intensity), and place the box
# over the matched groove — final_offset already includes the piece_width
# shift, so the box starts AT final_offset rather than before it.
draw_canvas = cv2.cvtColor(raw_frame, cv2.COLOR_GRAY2BGR)
cv2.rectangle(draw_canvas, (int(final_offset), 0),
              (int(final_offset + piece_width), h), (0, 0, 255), 2)

cv2.imshow("Primary Frame", raw_frame)
cv2.imshow("Debug Overlay", draw_canvas)
cv2.waitKey(0)

Parameter tuning should prioritize gradient sensitivity over pixel intensity. Edge thresholds may require adjustment based on background complexity and lighting conditions within the target application window.

Tags: python opencv pyautogui captcha-solving computer-vision

Posted on Wed, 13 May 2026 00:47:10 +0000 by jeva39