C# 爬蟲 - 使用 ChromeDriver 所見即所得
問題
最近在做爬蟲的時候發現,很多網頁的內容在瀏覽器裏看得見,但在網頁源文件裏卻看不到,也就是所謂的異步加載。如果我們需要這些異步內容,一種做法是先弄清楚它的加載規則,組合好條件後再次發起 http 請求來取得數據;但這種方式遇到邏輯複雜的頁面就比較不好處理。這時候 ChromeDriver 就派上用場了。
辦法
下面我們來看下這個例子
爬取騰訊視頻,獲取電視劇或電影鏈接。
瀏覽器是這樣的
// Configure a Chrome instance, load the Tencent Video player page in it, and hand the
// resulting page source to SetText (defined elsewhere in the project).
ChromeOptions options = new ChromeOptions();
// Pass the --test-type and --ignore-certificate-errors switches to Chrome.
options.AddArguments("--test-type", "--ignore-certificate-errors");
// Override the user-agent with an Android mobile Safari string
// (presumably so the site serves its mobile player page; confirm).
options.AddArguments("user-agent=mozilla/5.0 (linux; u; android 2.3.3; en-us; sdk build/ gri34) applewebkit/533.1 (khtml, like gecko) version/4.0 mobile safari/533.1");
options.AddArgument("enable-automation");
// Optional switches, currently disabled: headless mode and an authenticated proxy.
// options.AddArgument("headless");
// options.AddArguments("--proxy-server=http://user:password@yourProxyServer.com:8080");
// IWebDriver driver = new ChromeDriver(System.Environment.CurrentDirectory, options);
//chromeDriverService System.Environment.CurrentDirectory System.Environment.CurrentDirectory
// The first argument is the directory that contains the chromedriver executable; the third is a
// 120-second TimeSpan (the command timeout in Selenium's constructor overload; confirm against
// the Selenium version in use). The using block disposes the driver when the block exits.
using (IWebDriver driver = new OpenQA.Selenium.Chrome.ChromeDriver(@"C:\Users\Administrator\Downloads\chromedriver_win32", options, TimeSpan.FromSeconds(120)))
{
// trylogin(driver);
// Assigning Url makes the browser navigate to the player page.
driver.Url = "http://v.qq.com/iframe/player.html?tiny=1&auto=0&vid=z0023uikqoj";
// Id of the element that the parsing step looks up later.
//tenvideo_video_player_0
// PageSource is read from the live browser, so it reflects the DOM as rendered by Chrome.
SetText(driver.PageSource);
// ---- Disabled earlier experiment, kept for reference: crawling 1688 image-search results ----
////Thread.Sleep(200);
////try
////{
//// for (int a = 1; a < 2; a++)
//// {
//// SetText("\r\n第" + a.ToString() + "個");
//// driver.Navigate().GoToUrl("https://s.1688.com/youyuan/index.htm?tab=imageSearch&imageType=oss&imageAddress=cbuimgsearch/eWXC7XHHPN1607529600000&spm=");
//// // Log in
//// if (driver.Url.Contains("login.1688.com"))
//// {
//// SetText("\r\n需要登錄,開始嘗試...");
//// trylogin(driver); // login attempt finished
//// // Try again
//// driver.Navigate().GoToUrl("https://s.1688.com/youyuan/index.htm?tab=imageSearch&imageType=oss&imageAddress=cbuimgsearch/eWXC7XHHPN1607529600000&spm=");
//// if (driver.Url.Contains("login.1688.com"))
//// {
//// // Nothing more we can do; give up
//// SetText("\r\n退出,換ip重試...");
//// return;
//// }
//// }
//// // The hover content cannot all be revealed at once because the page itself only shows one at a time, so it cannot be displayed and then downloaded; it has to be fetched some other way
//// // var elements = document.getElementsByClassName('hover-container');
//// // Array.prototype.forEach.call(elements, function(element) {
//// // element.style.display = "block";
//// // console.log(element);
//// // });
//// IJavaScriptExecutor js = (IJavaScriptExecutor)driver;
//// var sss = js.ExecuteScript(" var elements = document.getElementsByClassName('hover-container'); Array.prototype.forEach.call(elements, function(element) { console.log(element); element.setAttribute(\"class\", \"測試title\"); element.style.display = \"block\"; console.log(element); });");
//// Thread.Sleep(500);
//// var responseModel = Write(driver.PageSource, Pagetypeenum.列表);
//// Thread.Sleep(500);
//// int i = 1;
//// foreach (var offer in responseModel?.data?.offerList ?? new List<OfferItemModel>())
//// {
//// driver.Navigate().GoToUrl(offer.information.detailUrl);
//// string responseDatadetail = driver.PageSource;
//// Write(driver.PageSource, Pagetypeenum.詳情);
//// SetText("\r\n第" + a.ToString() + "-" + i.ToString() + "個");
//// Thread.Sleep(500);
//// i++;
//// }
//// }
////}
////catch (Exception ex)
////{
//// CloseChromeDriver(driver);
//// throw;
////}
}
// Disabled: running the crawl on a separate thread.
// Thread thread = new Thread(go);
// thread.Start();
}
得到網頁信息 SetText(driver.PageSource);
/// <summary>
/// Reads the saved page source from disk, shows it in <c>textBox1</c>, then replaces that text
/// with the <c>src</c> attribute of the element whose id is <c>tenvideo_video_player_0</c>.
/// If the file, the element, or the attribute is missing, a short diagnostic message is shown
/// in <c>textBox1</c> instead of letting the handler throw.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button2_Click(object sender, EventArgs e)
{
    // Path of the file holding the previously captured page source.
    string filePath = @"G:\conan\reptiles1688\bin\Debug\test.txt";

    if (!File.Exists(filePath))
    {
        this.textBox1.Text = "File not found: " + filePath;
        return;
    }

    string html;
    // FileShare.ReadWrite is kept so the file can be read while another process still has it
    // open for writing. StreamReader.ReadToEnd replaces the single Stream.Read call, which is
    // allowed to return fewer bytes than requested and would then leave the tail of the buffer
    // as zero bytes.
    using (FileStream fsRead = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    using (StreamReader reader = new StreamReader(fsRead, System.Text.Encoding.Default))
    {
        html = reader.ReadToEnd();
    }

    // Show what was read.
    this.textBox1.Text = html;

    // Parse the string that was read, not the text box content, so the control cannot alter the markup.
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(html);

    // GetElementbyId returns null when the id is absent, and the attribute indexer returns null
    // when the element has no src; guard both instead of dereferencing blindly.
    HtmlNode node = doc.GetElementbyId("tenvideo_video_player_0");
    string src = node?.Attributes["src"]?.Value;
    if (src == null)
    {
        this.textBox1.Text = "Element 'tenvideo_video_player_0' or its 'src' attribute was not found.";
        return;
    }

    this.textBox1.Text = src;
    // Alternative lookup via XPath, kept for reference:
    // var node = doc.DocumentNode.SelectNodes("//video[@id='tenvideo_video_player_0']//video");
    // textBox1.Text = (node[3].InnerHtml);
}
}
解析後即可得到我們想要的視頻地址。
本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源:https://mp.weixin.qq.com/s/ft1Q1V56q97JnFYXY21s7Q