C# HTTP Helper for Web Scraping with Automatic Encoding Detection and Cookie Support

This utility class simplifies making HTTP requests in C# while automatically handling character encoding, gzip-compressed responses, cookies, and common request headers. It is especially useful for web scraping scenarios where the target page's encoding is unknown or inconsistent.

Core Features

  • Automatic detection of page encoding from HTML meta tags or response headers
  • Support for both GET and POST requests
  • Transparent decompression of gzip-encoded responses
  • Cookie injection support via header manipulation
  • Fallback to UTF-8 or GBK when encoding is ambiguous (e.g., ISO-8859-1)
  • Compatible with .NET Framework versions prior to 4.0

Implementation

using System;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Synchronous HTTP helper built on <see cref="HttpWebRequest"/> that downloads a page as text,
/// transparently un-gzips the body and, when no encoding was chosen explicitly, works out the
/// character set from the HTML meta tag or the response header.
/// </summary>
/// <remarks>
/// Instances carry mutable state (<see cref="Encoding"/> is overwritten by auto-detection),
/// so a single instance should not be shared between concurrent requests.
/// </remarks>
public class HttpHelper
{
    /// <summary>Text returned instead of a body when the request fails.</summary>
    private const string RequestErrorText = "Request Error";

    /// <summary>Finds the charset declared inside an HTML meta tag; built once and reused.</summary>
    private static readonly Regex MetaCharsetPattern = new Regex(
        @"<meta[^>]*charset\s*=\s*['""]?([^'""\s>]+)",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);

    /// <summary>
    /// Encoding used for the POST payload and for decoding the response.
    /// Leaving it at <c>Encoding.Default</c> (or setting it to null) means "auto-detect";
    /// after such a request the property holds the encoding that was detected.
    /// </summary>
    public Encoding Encoding { get; set; } = Encoding.Default;

    /// <summary>
    /// When true (the default), <see cref="Request"/>, <see cref="Get"/> and <see cref="Post"/>
    /// return the text lower-cased with the invariant culture (including the error text).
    /// </summary>
    public bool ConvertToLower { get; set; } = true;

    /// <summary>
    /// Sends an HTTP request and returns the response body as text.
    /// </summary>
    /// <param name="url">Target address; "http://" is prepended when no http/https scheme is present.</param>
    /// <param name="method">HTTP verb; upper-cased before use. Null is treated as "GET".</param>
    /// <param name="postData">Body to send; only written when the verb is POST and the value is non-empty.</param>
    /// <param name="accept">Value of the Accept header.</param>
    /// <param name="contentType">Value of the Content-Type header.</param>
    /// <param name="userAgent">Value of the User-Agent header.</param>
    /// <returns>
    /// The decoded body, or "Request Error" when the request or the download fails;
    /// lower-cased when <see cref="ConvertToLower"/> is true.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    public string Request(string url, string method = "GET", string postData = null,
        string accept = "text/html,application/xhtml+xml,*/*",
        string contentType = "text/html",
        string userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
    {
        if (url == null)
        {
            throw new ArgumentNullException(nameof(url));
        }

        HttpWebRequest request = CreateRequest(url, method, accept, contentType, userAgent);
        string body = Execute(request, postData);
        return ConvertToLower ? body.ToLowerInvariant() : body;
    }

    /// <summary>
    /// Issues a GET request. Passing no <paramref name="encoding"/> enables auto-detection.
    /// </summary>
    public string Get(string url, Encoding encoding = null)
    {
        Encoding = encoding ?? Encoding.Default;
        return Request(url, "GET");
    }

    /// <summary>
    /// Issues a POST request with <paramref name="data"/> as the body.
    /// Passing no <paramref name="encoding"/> enables auto-detection of the response encoding.
    /// </summary>
    public string Post(string url, string data, Encoding encoding = null)
    {
        Encoding = encoding ?? Encoding.Default;

        // Form fields are only parsed by servers when the body is labelled as form data;
        // the generic "text/html" default of Request() is wrong for a POST payload.
        return Request(url, "POST", data, contentType: "application/x-www-form-urlencoded");
    }

    /// <summary>
    /// Issues a GET request that carries a raw Cookie header, using the same download and
    /// encoding-detection pipeline as <see cref="Request"/>.
    /// </summary>
    /// <param name="url">Target address; "http://" is prepended when no http/https scheme is present.</param>
    /// <param name="cookieHeader">Complete Cookie header value; skipped when null or empty.</param>
    /// <param name="encoding">Explicit response encoding, or null to auto-detect.</param>
    /// <returns>
    /// The decoded body, or "Request Error" on failure. The text keeps its original case:
    /// <see cref="ConvertToLower"/> is intentionally not applied here so existing callers that
    /// compare against "Request Error" keep working.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    public string GetWithCookie(string url, string cookieHeader, Encoding encoding = null)
    {
        if (url == null)
        {
            throw new ArgumentNullException(nameof(url));
        }

        Encoding = encoding ?? Encoding.Default;

        HttpWebRequest request = CreateRequest(
            url,
            "GET",
            "text/html,application/xhtml+xml,*/*",
            "text/html",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)");

        if (!string.IsNullOrEmpty(cookieHeader))
        {
            request.Headers[HttpRequestHeader.Cookie] = cookieHeader;
        }

        return Execute(request, null);
    }

    /// <summary>Builds a redirect-following request with the common headers applied.</summary>
    private static HttpWebRequest CreateRequest(string url, string method, string accept,
        string contentType, string userAgent)
    {
        var request = (HttpWebRequest)WebRequest.Create(EnsureScheme(url));
        request.Method = (method ?? "GET").ToUpperInvariant();
        request.Accept = accept;
        request.ContentType = contentType;
        request.UserAgent = userAgent;
        request.AllowAutoRedirect = true;
        return request;
    }

    /// <summary>
    /// Sends the optional POST body, downloads the response and decodes it.
    /// Returns "Request Error" for transport, stream and decompression failures.
    /// </summary>
    private string Execute(HttpWebRequest request, string postData)
    {
        try
        {
            // Writing the body opens the connection, so it must sit inside the try block:
            // an unreachable host fails here, before GetResponse() is ever called.
            if (!string.IsNullOrEmpty(postData) && request.Method == "POST")
            {
                Encoding payloadEncoding = Encoding ?? Encoding.Default;
                byte[] payload = payloadEncoding.GetBytes(postData);
                request.ContentLength = payload.Length;
                using (Stream requestStream = request.GetRequestStream())
                {
                    requestStream.Write(payload, 0, payload.Length);
                }
            }

            using (var response = (HttpWebResponse)request.GetResponse())
            using (Stream content = OpenContentStream(response))
            {
                bool autoDetect = Encoding == null || ReferenceEquals(Encoding, Encoding.Default);
                if (!autoDetect)
                {
                    using (var reader = new StreamReader(content, Encoding))
                    {
                        return reader.ReadToEnd();
                    }
                }

                // The bytes have to be buffered: they are decoded once to look for a
                // meta charset and then again with the encoding that was found.
                byte[] rawBytes = ReadAllBytes(content);
                Encoding = DetectEncoding(rawBytes, response.CharacterSet);
                return Encoding.GetString(rawBytes);
            }
        }
        catch (WebException)
        {
            return RequestErrorText;
        }
        catch (IOException)
        {
            // Connection dropped while the body was being read.
            return RequestErrorText;
        }
        catch (InvalidDataException)
        {
            // Body was announced as gzip but is not valid gzip data.
            return RequestErrorText;
        }
    }

    /// <summary>
    /// Returns the response body stream, wrapped in a gzip decompressor when the
    /// Content-Encoding header mentions gzip (compared case-insensitively).
    /// </summary>
    private static Stream OpenContentStream(HttpWebResponse response)
    {
        Stream raw = response.GetResponseStream() ?? Stream.Null;
        string contentEncoding = response.ContentEncoding;
        bool isGzip = contentEncoding != null &&
            contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0;

        if (isGzip)
        {
            // Disposing the GZipStream also disposes the wrapped network stream.
            return new GZipStream(raw, CompressionMode.Decompress);
        }

        return raw;
    }

    /// <summary>
    /// Chooses the response encoding: meta-tag charset first, then the header charset,
    /// then UTF-8 when neither names an encoding this runtime can provide.
    /// </summary>
    private static Encoding DetectEncoding(byte[] rawBytes, string headerCharset)
    {
        // Charset names are plain ASCII, so a provisional decode is good enough to find the tag.
        string probeText = Encoding.Default.GetString(rawBytes);
        Match match = MetaCharsetPattern.Match(probeText);
        string metaCharset = match.Success ? match.Groups[1].Value : null;

        return ResolveCharset(metaCharset)
            ?? ResolveCharset(headerCharset)
            ?? Encoding.UTF8;
    }

    /// <summary>
    /// Maps a charset label to an <see cref="Encoding"/>; returns null when the label is
    /// empty, unknown or not supported on the current platform.
    /// </summary>
    private static Encoding ResolveCharset(string charset)
    {
        if (string.IsNullOrEmpty(charset))
        {
            return null;
        }

        // Invariant lower-casing: a culture-sensitive ToLower() turns "ISO" into "ıso"
        // under Turkish locales and the comparison below would silently fail.
        string name = charset.Trim().ToLowerInvariant()
            .Replace("\"", "").Replace("'", "").Replace(";", "")
            .TrimEnd('/');

        if (name.Length == 0)
        {
            return null;
        }

        // A declared ISO-8859-1 is treated as ambiguous and read as GBK instead.
        if (name == "iso-8859-1")
        {
            name = "gbk";
        }

        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>Reads <paramref name="input"/> to the end and returns everything that was read.</summary>
    private static byte[] ReadAllBytes(Stream input)
    {
        using (var collected = new MemoryStream())
        {
            var chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                collected.Write(chunk, 0, read);
            }

            return collected.ToArray();
        }
    }

    /// <summary>Prepends "http://" unless the address already starts with http:// or https://.</summary>
    private static string EnsureScheme(string url)
    {
        bool hasScheme =
            url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

        return hasScheme ? url : "http://" + url;
    }
}

Usage Examples

// Basic GET request with auto-detected encoding
// ("http://" is prepended automatically because the address has no scheme)
var helper = new HttpHelper();
string html = helper.Get("example.com/page");

// GET with explicit encoding
html = helper.Get("example.com/page", Encoding.GetEncoding("gb2312"));

// POST request
string result = helper.Post("https://api.example.com/submit", "param1=value1&param2=value2");

// GET with cookie (e.g., after login in WebBrowser)
// GetBrowserCookie is the P/Invoke helper shown in the next section; it returns null
// when no cookie data could be read.
string cookie = GetBrowserCookie("https://secure.example.com");
if (!string.IsNullOrEmpty(cookie))
{
    html = helper.GetWithCookie("https://secure.example.com/dashboard", cookie);
}

Extracting Cookies from WebBrowser Control

When using a WebBrowser control for authentication, cookies can be retrieved via P/Invoke:

// P/Invoke declaration for InternetGetCookieEx exported by wininet.dll.
// `data` is the buffer the cookie text is written into; `size` is passed by reference,
// and GetBrowserCookie reuses the value left in it after a failed call to size its retry buffer.
// NOTE(review): exact units of `size` (characters vs. bytes) are not visible here — confirm
// against the WinINet documentation.
[System.Runtime.InteropServices.DllImport("wininet.dll", CharSet = System.Runtime.InteropServices.CharSet.Auto)]
static extern bool InternetGetCookieEx(string url, string name, System.Text.StringBuilder data, 
    ref int size, int flags, IntPtr reserved);

// Flag value passed as `flags` on every call; presumably makes HttpOnly cookies part of
// the result — confirm against the WinINet documentation.
const int INTERNET_COOKIE_HTTPONLY = 0x00002000;

/// <summary>
/// Reads the cookie data for <paramref name="url"/> through InternetGetCookieEx.
/// The first attempt uses a 256-entry buffer; if it fails and the call left a positive
/// size behind, one retry is made with a buffer of that size.
/// </summary>
/// <param name="url">Address whose cookies are requested.</param>
/// <returns>The cookie text, or null when the lookup fails.</returns>
public static string GetBrowserCookie(string url)
{
    int capacity = 256;
    var cookieData = new System.Text.StringBuilder(capacity);

    bool succeeded = InternetGetCookieEx(
        url, null, cookieData, ref capacity, INTERNET_COOKIE_HTTPONLY, IntPtr.Zero);

    if (!succeeded)
    {
        // Without a usable size there is nothing to retry with.
        if (capacity <= 0)
        {
            return null;
        }

        // Second and last attempt, using the size reported by the failed call.
        cookieData = new System.Text.StringBuilder(capacity);
        succeeded = InternetGetCookieEx(
            url, null, cookieData, ref capacity, INTERNET_COOKIE_HTTPONLY, IntPtr.Zero);

        if (!succeeded)
        {
            return null;
        }
    }

    return cookieData.ToString();
}

Tags: C# HTTP web scraping Encoding Detection GZip Decompression

Posted on Fri, 15 May 2026 16:37:01 +0000 by RDKL PerFecT