CSharp 爬蟲 - Selenium ChromeDriver 設置代理

背景

開發爬蟲程序,如果不做代理設置,本機的外網 IP 很容易被網站封掉,導致不能持續進行數據抓取。而 Selenium 作爲動態網頁抓取的利器,我們有必要了解一下,如何對它進行代理設置,並正常訪問網頁。

解決辦法

1、首先申請代理 IP,通常付費的才比較靠譜。申請後會拿到代理的域名地址、端口、賬號和密碼。

  // Proxy settings used by the snippets below. The Chinese values are article
  // placeholders and must be replaced with the real values from the proxy provider.

  // Proxy host name (placeholder text: "domain address").
  private string proxy_Host = "域名地址";
        // Proxy port. NOTE: `端口` ("port") is a bare placeholder, not a literal; replace it
        // with an integer before compiling. The field name is spelled "Post" in the original
        // and is referenced under that name when the --proxy-server argument is built.
        private int proxy_Post = 端口;
        // Proxy account name (placeholder text: "account").
        private string proxy_UserName = "賬號";
        // Proxy password (placeholder text: "password").
        private string proxy_PassWord = "密碼";
        // URL used to check that the proxy works (placeholder text: "address for checking").
        private string proxy_CheckURL = "檢查是否正常的地址";
        // File name of the generated Chrome extension archive.
        private string Ex_Proxy_Name = "proxy.zip";

2、生成 Chrome 擴展所需的 background.js、manifest.json,並打包爲 proxy.zip

  /// <summary>
        /// Generates the Chrome extension archive that answers proxy authentication
        /// challenges. The archive is written to
        /// <c>Environment.CurrentDirectory/Ex_Proxy_Name</c> and contains two entries:
        /// <c>background.js</c> (registers a <c>chrome.webRequest.onAuthRequired</c>
        /// listener that replies with the given credentials) and <c>manifest.json</c>.
        /// Any existing file at that path is overwritten.
        /// </summary>
        /// <param name="proxy_UserName">Proxy account name embedded in background.js.</param>
        /// <param name="proxy_PassWord">Proxy password embedded in background.js.</param>
        /// <returns>
        /// <c>true</c> when both entries were written; <c>false</c> when any exception
        /// occurred (the exception is traced, not rethrown).
        /// </returns>
        /// <remarks>
        /// NOTE(review): the manifest declares <c>manifest_version</c> 2 — confirm the
        /// target Chrome version still loads Manifest V2 extensions.
        /// </remarks>
        private bool Rebuild_Extension_Proxy(string proxy_UserName, string proxy_PassWord)
        {
            // The credentials end up inside single-quoted JavaScript string literals, so
            // quotes, backslashes and line breaks must be escaped or the script breaks.
            string safeUserName = Escape_Js_Single_Quoted(proxy_UserName);
            string safePassWord = Escape_Js_Single_Quoted(proxy_PassWord);

            string background = @"
                var Global = {
                    currentProxyAouth:
                    {
                        username: '',
                        password: ''
                    }
                }
                Global.currentProxyAouth = {
                        username: '" + safeUserName + @"',
                        password: '" + safePassWord + @"'
                }
                chrome.webRequest.onAuthRequired.addListener(
                    function(details, callbackFn) {
                        console.log('onAuthRequired >>>: ', details, callbackFn);
                        callbackFn({
                            authCredentials: Global.currentProxyAouth
                        });
                    }, {
                        urls: [""<all_urls>""]
                    }, [""asyncBlocking""]);
                chrome.runtime.onMessage.addListener(
                    function(request, sender, sendResponse) {
                        console.log('Background recieved a message: ', request);
                        POPUP_PARAMS = {};
                        if (request.command && requestHandler[request.command])
                            requestHandler[request.command] (request);
                    }
                );";
            string manifest = @"
                {
                    ""version"": ""1.0.0"",
                    ""manifest_version"": 2,
                    ""name"": ""Chrome Proxy"",
                    ""permissions"": [
                        ""proxy"",
                        ""tabs"",
                        ""unlimitedStorage"",
                        ""storage"",
                        ""<all_urls>"",
                        ""webRequest"",
                        ""webRequestBlocking""
                    ],
                    ""background"": {
                        ""scripts"": [""background.js""]
                    },
                    ""minimum_chrome_version"":""22.0.0""
                }";

            try
            {
                string zipPath = Path.Combine(System.Environment.CurrentDirectory, Ex_Proxy_Name);

                // FileMode.Create truncates any previous archive. ZipArchiveMode.Create is
                // the mode intended for writing a new archive; it streams entries out
                // instead of buffering the whole archive as Update mode does.
                using (FileStream zipToOpen = new FileStream(zipPath, FileMode.Create))
                using (ZipArchive archive = new ZipArchive(zipToOpen, ZipArchiveMode.Create))
                {
                    // Create mode allows only one open entry at a time, so each entry
                    // stream is closed before the next entry is created.
                    Write_Zip_Entry(archive, "background.js", background);
                    Write_Zip_Entry(archive, "manifest.json", manifest);
                }
                return true;
            }
            catch (Exception ex)
            {
                // Callers only look at the boolean result, so keep the no-throw contract
                // but leave a trace instead of discarding the failure silently.
                System.Diagnostics.Trace.TraceError("Rebuild_Extension_Proxy failed: " + ex);
                return false;
            }
        }

        /// <summary>Writes <paramref name="content"/> plus a trailing newline as a new archive entry.</summary>
        private static void Write_Zip_Entry(ZipArchive archive, string entryName, string content)
        {
            ZipArchiveEntry entry = archive.CreateEntry(entryName);
            using (StreamWriter writer = new StreamWriter(entry.Open()))
            {
                writer.WriteLine(content);
            }
        }

        /// <summary>
        /// Escapes a value for use inside a single-quoted JavaScript string literal.
        /// A null value is treated as an empty string.
        /// </summary>
        private static string Escape_Js_Single_Quoted(string value)
        {
            if (string.IsNullOrEmpty(value))
            {
                return "";
            }

            // Backslash must be escaped first so the escapes added afterwards are not doubled.
            return value
                .Replace("\\", "\\\\")
                .Replace("'", "\\'")
                .Replace("\r", "\\r")
                .Replace("\n", "\\n")
                .Replace("\u2028", "\\u2028")
                .Replace("\u2029", "\\u2029");
        }

3、Chrome Driver 使用代理 Proxy

 // Excerpt: prepare the Chrome Driver extension-based proxy settings.
 // `_isuseproxy` and `options` are declared outside this excerpt.
                // Defaults to true so that the driver is still configured when no proxy is used.
                bool isproxysetting = true;
                if (_isuseproxy)
                {
                    // Regenerate the extension archive with the current credentials;
                    // a false result skips the driver configuration below.
                    isproxysetting = Rebuild_Extension_Proxy(proxy_UserName, proxy_PassWord);
                }
                if (isproxysetting)
                {
                    // Driver configuration
                    options = new ChromeOptions();
                    if (_isuseproxy)
                    {
                        // NOTE(review): presumably cleared so that only the command-line
                        // flag below configures the proxy — confirm.
                        options.Proxy = null;
                        // Only host:port is passed here; the user name and password are
                        // supplied by the extension's onAuthRequired listener.
                        options.AddArguments("--proxy-server=" + proxy_Host + ":" + proxy_Post.ToString());
                        // Relative file name: Rebuild_Extension_Proxy writes the archive
                        // under Environment.CurrentDirectory.
                        options.AddExtension(Ex_Proxy_Name);
                    }

4、測試一下我們的設置

  /// <summary>
        /// Extracts the proxy IP information from the page source of the check URL.
        /// The browser's <c>&lt;pre&gt;</c> wrapper around the raw response is stripped and
        /// the remaining text is deserialized as JSON.
        /// </summary>
        /// <param name="Html_Content">Page source returned by the browser; may be null.</param>
        /// <returns>
        /// The deserialized <c>ProxyIPInfo</c>, or <c>null</c> when the input is null or
        /// empty, the page contains "proxy error", or deserialization fails.
        /// </returns>
        private Proxy_Unit.ProxyIPInfo Get_ProxyIPInfo(string Html_Content)
        {
            // Handle missing input explicitly instead of relying on a caught
            // NullReferenceException.
            if (string.IsNullOrEmpty(Html_Content))
            {
                return null;
            }

            // Strip the wrapper markup the browser puts around a plain-text response.
            string json = Html_Content
                .Replace("<html><head></head><body><pre style=\"word-wrap: break-word; white-space: pre-wrap;\">", "")
                .Replace("</pre></body></html>", "");

            if (json.Contains("proxy error"))
            {
                return null;
            }

            try
            {
                return JsonConvert.DeserializeObject<Proxy_Unit.ProxyIPInfo>(json);
            }
            catch (Exception)
            {
                // Malformed or unexpected content is reported as "no info", matching
                // the method's null-on-failure contract.
                return null;
            }
        }

測試效果

成功,達到預期效果

{
    "ip":"213.182.205.185",
    "country":"IS",
    "asn":{
        "asnum":9009,
        "org_name":"M247 Ltd"
    },
    "geo":{
        "city":"Reykjavik",
        "region":"1",
        "region_name":"Capital Region",
        "postal_code":"105",
        "latitude":64.1369,
        "longitude":-21.9139,
        "tz":"Atlantic/Reykjavik",
        "lum_city":"reykjavik",
        "lum_region":"1"
    }
}

總結

我們之前測試要爲 ChromeDriver 設定 Proxy 時有遇到許多困難,需要使用 Chrome Extension 的管道設定 Proxy 才成功,以上希望能讓您比較好了解。

本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源:https://mp.weixin.qq.com/s/nyhIMInHZa1yOrwJu1lImg