CSharp 爬蟲 - Selenium ChromeDriver 設置代理
背景
開發爬蟲程序,如果不做代理設置,本機的外網 IP 很容易被網站封掉,導致不能持續進行數據抓取。而 Selenium 作爲動態網頁抓取的利器,我們有必要了解一下,如何對它進行代理設置,並正常訪問網頁。
解決辦法
1、首先申請代理 IP,付費的代理通常比較可靠。申請後會取得代理的域名、端口、賬號與密碼。
// Proxy configuration fields. The right-hand sides are placeholders from the article
// and must be replaced with real values before this compiles.
// NOTE(review): real credentials should not be hard-coded in source; load them from configuration.

// Host name of the proxy server; combined with proxy_Post into the --proxy-server argument.
private string proxy_Host = "域名地址";
// Port of the proxy server (the identifier is spelled "Post" and is referenced elsewhere under that name).
private int proxy_Post = 端口;
// User name embedded into the generated background.js for proxy authentication.
private string proxy_UserName = "賬號";
// Password embedded into the generated background.js for proxy authentication.
private string proxy_PassWord = "密碼";
// Presumably a URL that echoes the caller's IP so the proxy can be verified; not used in the code shown here.
private string proxy_CheckURL = "檢查是否正常的地址";
// File name of the extension archive written by Rebuild_Extension_Proxy and passed to ChromeOptions.AddExtension.
private string Ex_Proxy_Name = "proxy.zip";
2、生成 Chrome 擴展所需的 background.js 與 manifest.json,並打包成 proxy.zip
/// <summary>
/// Regenerates the Chrome extension archive (named by <c>Ex_Proxy_Name</c>, placed in the
/// current directory) that answers proxy authentication challenges with the given credentials.
/// The archive contains two entries: <c>background.js</c> and <c>manifest.json</c>.
/// </summary>
/// <param name="proxy_UserName">Proxy account name written into background.js.</param>
/// <param name="proxy_PassWord">Proxy password written into background.js.</param>
/// <returns><c>true</c> when the archive was written; <c>false</c> if any exception occurred.</returns>
private bool Rebuild_Extension_Proxy(string proxy_UserName, string proxy_PassWord)
{
    try
    {
        // The credentials are embedded in single-quoted JavaScript literals, so they must be
        // escaped; otherwise a quote or backslash in the password corrupts the script.
        string background = @"
var Global = {
currentProxyAouth:
{
username: '',
password: ''
}
}
Global.currentProxyAouth = {
username: '" + Escape_Js_String(proxy_UserName) + @"',
password: '" + Escape_Js_String(proxy_PassWord) + @"'
}
chrome.webRequest.onAuthRequired.addListener(
function(details, callbackFn) {
console.log('onAuthRequired >>>: ', details, callbackFn);
callbackFn({
authCredentials: Global.currentProxyAouth
});
}, {
urls: [""<all_urls>""]
}, [""asyncBlocking""]);
chrome.runtime.onMessage.addListener(
function(request, sender, sendResponse) {
console.log('Background recieved a message: ', request);
POPUP_PARAMS = {};
if (request.command && requestHandler[request.command])
requestHandler[request.command] (request);
}
);";

        string manifest = @"
{
""version"": ""1.0.0"",
""manifest_version"": 2,
""name"": ""Chrome Proxy"",
""permissions"": [
""proxy"",
""tabs"",
""unlimitedStorage"",
""storage"",
""<all_urls>"",
""webRequest"",
""webRequestBlocking""
],
""background"": {
""scripts"": [""background.js""]
},
""minimum_chrome_version"":""22.0.0""
}";

        // Path.Combine instead of a hard-coded "\\" so the separator is correct on every platform.
        string zipPath = Path.Combine(System.Environment.CurrentDirectory, Ex_Proxy_Name);

        // FileMode.Create truncates any archive left over from a previous run.
        // The using blocks dispose the archive before the file stream, on success and on failure.
        using (FileStream zipToOpen = new FileStream(zipPath, FileMode.Create))
        using (ZipArchive archive = new ZipArchive(zipToOpen, ZipArchiveMode.Update))
        {
            Write_Zip_Entry(archive, "background.js", background);
            Write_Zip_Entry(archive, "manifest.json", manifest);
        }

        return true;
    }
    catch (Exception ex)
    {
        // Callers only look at the boolean result, so the failure is reported as false;
        // the exception is still traced so it is not lost entirely.
        System.Diagnostics.Debug.WriteLine("Rebuild_Extension_Proxy failed: " + ex);
        return false;
    }
}

/// <summary>Creates one entry in <paramref name="archive"/> and writes <paramref name="content"/> followed by a newline.</summary>
private static void Write_Zip_Entry(ZipArchive archive, string entryName, string content)
{
    ZipArchiveEntry entry = archive.CreateEntry(entryName);
    using (StreamWriter writer = new StreamWriter(entry.Open()))
    {
        writer.WriteLine(content);
    }
}

/// <summary>Escapes a value for use inside a single-quoted JavaScript string literal; null becomes an empty string.</summary>
private static string Escape_Js_String(string value)
{
    if (string.IsNullOrEmpty(value))
    {
        return "";
    }

    // Backslashes must be escaped first so the escapes added afterwards are not doubled.
    return value
        .Replace("\\", "\\\\")
        .Replace("'", "\\'")
        .Replace("\r", "\\r")
        .Replace("\n", "\\n");
}
3、Chrome Driver 使用代理 Proxy
// Configure the Chrome Driver extension-based proxy settings.
// Defaults to true so that driver setup proceeds when no proxy is requested.
bool isproxysetting = true;
if (_isuseproxy)
{
// Regenerate the authentication extension archive; false means the archive could not be written.
isproxysetting = Rebuild_Extension_Proxy(proxy_UserName, proxy_PassWord);
}
if (isproxysetting)
{
// Driver settings.
// NOTE(review): this `if (isproxysetting)` block is not closed within this excerpt.
options = new ChromeOptions();
if (_isuseproxy)
{
// Clear the Proxy property; the proxy endpoint is supplied through the command-line argument below.
options.Proxy = null;
options.AddArguments("--proxy-server=" + proxy_Host + ":" + proxy_Post.ToString());
// Load the generated extension, whose background script supplies the proxy credentials.
options.AddExtension(Ex_Proxy_Name);
}
4、測試一下我們的設置
/// <summary>
/// Extracts the JSON payload from the page source Chrome renders for a raw JSON response
/// and deserializes it into a <c>Proxy_Unit.ProxyIPInfo</c>.
/// </summary>
/// <param name="Html_Content">Page source returned by the browser for the check URL.</param>
/// <returns>
/// The deserialized info, or <c>null</c> when the input is null/blank, contains the text
/// "proxy error", or cannot be deserialized.
/// </returns>
private Proxy_Unit.ProxyIPInfo Get_ProxyIPInfo(string Html_Content)
{
    // Handle missing input explicitly instead of relying on a caught NullReferenceException.
    if (string.IsNullOrWhiteSpace(Html_Content))
    {
        return null;
    }

    try
    {
        // Strip the <pre> wrapper Chrome puts around a plain-text/JSON body.
        string json = Html_Content
            .Replace("<html><head></head><body><pre style=\"word-wrap: break-word; white-space: pre-wrap;\">", "")
            .Replace("</pre></body></html>", "");

        // Any body containing this text is treated as a failed proxy request and is not parsed.
        if (json.Contains("proxy error"))
        {
            return null;
        }

        return JsonConvert.DeserializeObject<Proxy_Unit.ProxyIPInfo>(json);
    }
    catch (Exception ex)
    {
        // Callers treat null as "proxy check failed"; trace the cause rather than dropping it silently.
        System.Diagnostics.Debug.WriteLine("Get_ProxyIPInfo failed: " + ex);
        return null;
    }
}
測試效果
成功,達到預期效果
{
"ip":"213.182.205.185",
"country":"IS",
"asn":{
"asnum":9009,
"org_name":"M247 Ltd"
},
"geo":{
"city":"Reykjavik",
"region":"1",
"region_name":"Capital Region",
"postal_code":"105",
"latitude":64.1369,
"longitude":-21.9139,
"tz":"Atlantic/Reykjavik",
"lum_city":"reykjavik",
"lum_region":"1"
}
}
總結
我們之前測試要爲 ChromeDriver 設定 Proxy 時有遇到許多困難,需要使用 Chrome Extension 的管道設定 Proxy 才成功,以上希望能讓您比較好了解。
本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源:https://mp.weixin.qq.com/s/nyhIMInHZa1yOrwJu1lImg