golang 源碼分析: chromedp

        chromedp 是用 Go 編寫的、基於 Chrome DevTools Protocol(CDP)驅動瀏覽器的庫,項目地址:https://github.com/chromedp/chromedp 。隨着前端 SPA 應用的普及,頁面內容大多由 JavaScript 動態渲染,傳統的爬蟲很難抓取到我們想要的內容;CDP 提供了一套完整的瀏覽器控制接口,使我們可以在真實的瀏覽器環境中發起請求,抓取動態生成的網頁。那麼 CDP 本質上是什麼呢?它本質上是一種基於 WebSocket 傳輸 JSON 消息的協議。

        下面我們通過兩個例子看下如何使用 chromedp:

// Command click is a chromedp example demonstrating how to use a selector to
// click on an element.
package main
import (
  "context"
  "log"
  "time"
  "github.com/chromedp/chromedp"
)
// main drives the click example: it opens https://pkg.go.dev/time, waits for
// the page footer, clicks the "After" example, and logs the content of the
// example's textarea. The whole run is limited to 15 seconds.
func main() {
	// Create the chromedp browser context.
	browserCtx, stopBrowser := chromedp.NewContext(
		context.Background(),
		// chromedp.WithDebugf(log.Printf),
	)
	defer stopBrowser()

	// Derive a context that gives up after 15 seconds.
	runCtx, stopRun := context.WithTimeout(browserCtx, 15*time.Second)
	defer stopRun()

	// snippet receives the value of the example's textarea.
	var snippet string
	if err := chromedp.Run(runCtx,
		chromedp.Navigate(`https://pkg.go.dev/time`),
		// The footer becoming visible is used as the "page is loaded" signal.
		chromedp.WaitVisible(`body > footer`),
		// Click the "Example" link once it is visible.
		chromedp.Click(`#example-After`, chromedp.NodeVisible),
		// Read the value of the example's textarea.
		chromedp.Value(`#example-After textarea`, &snippet),
	); err != nil {
		log.Fatal(err)
	}

	log.Printf("Go's time.After example:\n%s", snippet)
}

返回結果如下:

2023/07/02 23:29:57 Go's time.After example:
package main
import (
        "fmt"
        "time"
)
var c chan int
func handle(int) {}
func main() {
        select {
        case m := <-c:
                handle(m)
        case <-time.After(10 * time.Second):
                fmt.Println("timed out")
        }
}

這樣就完全模擬了一次瀏覽器的點擊操作,並返回了我們想要的內容。當然,chromedp 也可以用來種植(設置)cookie,例如:

// Command cookie is a chromedp example demonstrating how to set an HTTP cookie
// on requests.
package main
import (
  "context"
  "encoding/json"
  "flag"
  "fmt"
  "log"
  "net/http"
  "time"
  "github.com/chromedp/cdproto/cdp"
  "github.com/chromedp/cdproto/network"
  "github.com/chromedp/cdproto/storage"
  "github.com/chromedp/chromedp"
)
// main starts the local cookie server, creates a chromedp context, and runs
// the setcookies task list against that server, logging the cookies the page
// reports back.
//
// The -port flag (default 8544) selects the port the cookie server listens on.
func main() {
	port := flag.Int("port", 8544, "port")
	flag.Parse()

	// Start the cookie server in the background. Its error is logged instead
	// of being dropped, so a failed listen (for example, the port already
	// being in use) is reported directly rather than only showing up as a
	// failed run later on.
	addr := fmt.Sprintf(":%d", *port)
	go func() {
		if err := cookieServer(addr); err != nil {
			log.Printf("cookie server: %v", err)
		}
	}()

	// create context
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()

	// Bound the whole run. Without a deadline, a `#result` element that never
	// becomes visible would leave the run waiting indefinitely.
	ctx, cancel = context.WithTimeout(ctx, 15*time.Second)
	defer cancel()

	// run task list
	var res string
	err := chromedp.Run(ctx, setcookies(
		fmt.Sprintf("http://localhost:%d", *port), &res,
		"cookie1", "value1",
		"cookie2", "value2",
	))
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("chrome received cookies: %s", res)
}
// cookieServer serves a single "/" route on addr. Each request has its cookies
// logged one by one and then echoed back as indented JSON substituted into the
// indexHTML template. The function blocks while serving and returns the error
// reported by http.ListenAndServe.
func cookieServer(addr string) error {
	echoCookies := func(w http.ResponseWriter, r *http.Request) {
		received := r.Cookies()
		for idx := range received {
			log.Printf("from %s, server received cookie %d: %v", r.RemoteAddr, idx, received[idx])
		}

		body, err := json.MarshalIndent(received, "", "  ")
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}
		fmt.Fprintf(w, indexHTML, string(body))
	}

	router := http.NewServeMux()
	router.HandleFunc("/", echoCookies)
	return http.ListenAndServe(addr, router)
}
// setcookies builds the task list used by the cookie example. In order, the
// tasks add the given cookies to chrome, navigate to host, store the text of
// the `#result` element in res, and log the cookies returned by
// storage.GetCookies.
//
// cookies is a flat list of name/value pairs ("name1", "value1", ...); the
// function panics when its length is odd.
func setcookies(host string, res *string, cookies ...string) chromedp.Tasks {
	if len(cookies)%2 != 0 {
		panic("length of cookies must be divisible by 2")
	}

	// installCookies sets each name/value pair as an HTTP-only cookie for the
	// "localhost" domain, expiring 180 days from the moment the action runs.
	installCookies := chromedp.ActionFunc(func(ctx context.Context) error {
		expiry := cdp.TimeSinceEpoch(time.Now().Add(180 * 24 * time.Hour))
		for n := 0; n < len(cookies); n += 2 {
			name, value := cookies[n], cookies[n+1]
			setter := network.SetCookie(name, value).
				WithExpires(&expiry).
				WithDomain("localhost").
				WithHTTPOnly(true)
			if err := setter.Do(ctx); err != nil {
				return err
			}
		}
		return nil
	})

	// dumpCookies logs every cookie reported by storage.GetCookies.
	dumpCookies := chromedp.ActionFunc(func(ctx context.Context) error {
		stored, err := storage.GetCookies().Do(ctx)
		if err != nil {
			return err
		}
		for n, c := range stored {
			log.Printf("chrome cookie %d: %+v", n, c)
		}
		return nil
	})

	return chromedp.Tasks{
		installCookies,
		// navigate to site
		chromedp.Navigate(host),
		// read the returned values
		chromedp.Text(`#result`, res, chromedp.ByID, chromedp.NodeVisible),
		dumpCookies,
	}
}
// indexHTML is the page template written by cookieServer; its single %s verb
// receives the JSON-encoded request cookies. The div carries id="result"
// because setcookies reads the page back through the `#result` selector with
// chromedp.ByID, which cannot match an element that has no such id.
const (
	indexHTML = `<!doctype html>
<html>
<body>
  <div id="result">%s</div>
</body>
</html>`
)
本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源:https://mp.weixin.qq.com/s/PYJF-N-a9L_NIvYMPZ-SQQ