golang 源碼分析: chromedp
chromedp 是用 Go 編寫的瀏覽器驅動庫,基於 Chrome DevTools Protocol (CDP) 實現,項目地址: https://github.com/chromedp/chromedp 。隨着前端 SPA 應用的普及,頁面內容大多由 JavaScript 動態生成,傳統的爬蟲很難抓取到我們想要的內容;CDP 提供了一套完整的瀏覽器控制接口,使我們可以在真實的瀏覽器環境中模擬請求,抓取動態生成的網頁。那麼 CDP 本質上是什麼呢?它本質上是一種基於 WebSocket 傳輸 JSON 消息的協議。
下面我們通過兩個例子看下如何使用 chromedp:
// Command click is a chromedp example demonstrating how to use a selector to
// click on an element.
package main
import (
"context"
"log"
"time"
"github.com/chromedp/chromedp"
)
// main opens the Go time package documentation in Chrome, clicks the
// time.After example and logs the text of the example's textarea.
func main() {
	// create chrome instance
	browserCtx, stopBrowser := chromedp.NewContext(
		context.Background(),
		// chromedp.WithDebugf(log.Printf),
	)
	defer stopBrowser()

	// bound the whole run to 15 seconds
	runCtx, stopRun := context.WithTimeout(browserCtx, 15*time.Second)
	defer stopRun()

	// snippet receives the value of the example's textarea.
	var snippet string

	// navigate to a page, wait for an element, click
	steps := []chromedp.Action{
		chromedp.Navigate(`https://pkg.go.dev/time`),
		// wait until the footer element is visible (ie, page is loaded)
		chromedp.WaitVisible(`body > footer`),
		// find and click "Example" link
		chromedp.Click(`#example-After`, chromedp.NodeVisible),
		// retrieve the text of the textarea
		chromedp.Value(`#example-After textarea`, &snippet),
	}
	if err := chromedp.Run(runCtx, steps...); err != nil {
		log.Fatal(err)
	}

	log.Printf("Go's time.After example:\n%s", snippet)
}
返回結果如下:
2023/07/02 23:29:57 Go's time.After example:
package main
import (
"fmt"
"time"
)
var c chan int
func handle(int) {}
func main() {
select {
case m := <-c:
handle(m)
case <-time.After(10 * time.Second):
fmt.Println("timed out")
}
}
可以看到,chromedp 完整模擬了瀏覽器中的點擊操作,並返回了我們想要的內容。當然,它也可以用來設置 cookie,例如:
// Command cookie is a chromedp example demonstrating how to set a HTTP cookie
// on requests.
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"log"
"net/http"
"time"
"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/cdproto/storage"
"github.com/chromedp/chromedp"
)
// main starts the local cookie server on the port given by -port (default
// 8544), runs the setcookies task list against it in Chrome, and logs the
// page text that Chrome read back.
func main() {
	port := flag.Int("port", 8544, "port")
	flag.Parse()

	// start cookie server
	//
	// cookieServer only returns when listening/serving fails. Report that
	// error instead of dropping it: without a server the navigation below
	// has nothing to talk to.
	go func() {
		if err := cookieServer(fmt.Sprintf(":%d", *port)); err != nil {
			log.Fatalf("cookie server: %v", err)
		}
	}()

	// create context
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()

	// run task list
	var res string
	err := chromedp.Run(ctx, setcookies(
		fmt.Sprintf("http://localhost:%d", *port), &res,
		"cookie1", "value1",
		"cookie2", "value2",
	))
	if err != nil {
		log.Fatal(err)
	}

	log.Printf("chrome received cookies: %s", res)
}
// cookieServer listens on addr and answers every path with the indexHTML
// template filled in with the request's cookies encoded as indented JSON.
// Each received cookie is also logged together with the client address.
// It returns the error reported by http.ListenAndServe.
func cookieServer(addr string) error {
	handler := func(w http.ResponseWriter, r *http.Request) {
		received := r.Cookies()
		for idx := range received {
			log.Printf("from %s, server received cookie %d: %v", r.RemoteAddr, idx, received[idx])
		}

		payload, err := json.MarshalIndent(received, "", " ")
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}
		fmt.Fprintf(w, indexHTML, string(payload))
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/", handler)
	return http.ListenAndServe(addr, mux)
}
// setcookies builds the task list for the cookie demo. The variadic cookies
// argument is read as alternating name, value pairs; an odd count panics.
//
// The returned tasks, in order: set every pair as an HTTP-only cookie for
// domain "localhost" expiring 180 days from when the task runs, navigate to
// host, store the text of the #result element in *res, and log the cookies
// returned by storage.GetCookies.
func setcookies(host string, res *string, cookies ...string) chromedp.Tasks {
	if len(cookies)%2 == 1 {
		panic("length of cookies must be divisible by 2")
	}

	// install adds each name/value pair to chrome.
	install := chromedp.ActionFunc(func(ctx context.Context) error {
		// create cookie expiration
		expiry := cdp.TimeSinceEpoch(time.Now().Add(180 * 24 * time.Hour))

		for i := 0; i < len(cookies); i += 2 {
			name, value := cookies[i], cookies[i+1]
			params := network.SetCookie(name, value).
				WithExpires(&expiry).
				WithDomain("localhost").
				WithHTTPOnly(true)
			if err := params.Do(ctx); err != nil {
				return err
			}
		}
		return nil
	})

	// dump logs the cookies reported by storage.GetCookies.
	dump := chromedp.ActionFunc(func(ctx context.Context) error {
		stored, err := storage.GetCookies().Do(ctx)
		if err != nil {
			return err
		}
		for idx := range stored {
			log.Printf("chrome cookie %d: %+v", idx, stored[idx])
		}
		return nil
	})

	return chromedp.Tasks{
		install,
		// navigate to site
		chromedp.Navigate(host),
		// read the returned values
		chromedp.Text(`#result`, res, chromedp.ByID, chromedp.NodeVisible),
		// read network values
		dump,
	}
}
const (
	// indexHTML is the page template written by cookieServer; its single %s
	// verb receives the JSON-encoded cookies. The div must carry
	// id="result" because setcookies reads the page back with the `#result`
	// selector (chromedp.ByID); without the id that query never matches.
	indexHTML = `<!doctype html>
<html>
<body>
<div id="result">%s</div>
</body>
</html>`
)
本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源:https://mp.weixin.qq.com/s/PYJF-N-a9L_NIvYMPZ-SQQ