Go 實戰 -四-: 基金分析系統 - 爬取基金詳情
- 爬取邏輯流程
- 準備工作
2.1 分析網頁
2.2 編寫結構體
根據上面的網頁分析,可以先用 `div[class='txt_cont']` 選中 `table`,然後根據第 `n` 行第 `x` 列,來編寫對應的結構體,如下:
// BasisCrawl maps cells of the scraped fund-overview table onto string fields.
// CrawlHtml fills it by calling element.Unmarshal on the div[class='txt_cont']
// element; ConvertToEntity then cleans the raw strings.
type BasisCrawl struct {
// tr:nth-child(n) > td:nth-of-type(x): selects row n, column x of the table.
// Code may hold two codes joined by "、"; ConvertToEntity keeps the first.
Code string `selector:"tr:nth-child(2) > td:nth-of-type(1)"`
FullName string `selector:"tr:nth-child(1) > td:nth-of-type(1)"`
ShortName string `selector:"tr:nth-child(1) > td:nth-of-type(2)"`
// Type is split on "-" by ConvertToEntity into a main type and a sub type.
Type string `selector:"tr:nth-child(2) > td:nth-of-type(2)"`
ReleaseDate string `selector:"tr:nth-child(3) > td:nth-of-type(1)"`
// EstablishDate and EstablishShares deliberately read the same cell:
// ConvertToEntity splits it on "/" and takes the part before the slash as
// the date and the part after it as the shares.
EstablishDate string `selector:"tr:nth-child(3) > td:nth-of-type(2)"`
EstablishShares string `selector:"tr:nth-child(3) > td:nth-of-type(2)"`
Company string `selector:"tr:nth-child(5) > td:nth-of-type(1)"`
Manager string `selector:"tr:nth-child(6) > td:nth-of-type(1)"`
// ManagerDesc takes the href attribute of the link inside the manager cell,
// not the cell text.
ManagerDesc string `selector:"tr:nth-child(6) > td:nth-of-type(1) > a[href]" attr:"href"`
ManageFeeRate string `selector:"tr:nth-child(7) > td:nth-of-type(1)"`
CustodyFeeRate string `selector:"tr:nth-child(7) > td:nth-of-type(2)"`
SaleFeeRate string `selector:"tr:nth-child(8) > td:nth-of-type(1)"`
Benchmark string `selector:"tr:nth-child(10) > td:nth-of-type(1)"`
}
- 請求流程預覽
- 代碼實現
4.1 批量抓取入口函數
// BatchBasicCrawl crawls the detail page of every fund returned by
// dao.FilterBasicFund and stores the parsed rows with a single Create call.
//
// It returns nothing; a failed insert and the number of inserted rows are
// reported through global.GvaLogger.
func BatchBasicCrawl() {
	// Codes from the ranking table, minus the ones that already have details.
	basicFundList := dao.FilterBasicFund()
	total := len(basicFundList)
	if total == 0 {
		return
	}

	// One buffer slot per requested fund: crawlByGroup sends every result and
	// closes the channel before anything is received here, so the buffer must
	// be able to hold all of them.
	baseRowsChannel := make(chan entity.FundBasis, total)
	crawlByGroup(basicFundList, baseRowsChannel)

	// The channel is closed by crawlByGroup, so this loop drains it and ends.
	fundBasisRows := make([]entity.FundBasis, 0, len(baseRowsChannel))
	for item := range baseRowsChannel {
		fundBasisRows = append(fundBasisRows, item)
	}
	if len(fundBasisRows) == 0 {
		return
	}

	// Persist all rows in one batch.
	create := global.GvaMysqlClient.Create(fundBasisRows)
	if create.Error != nil {
		// The format verb is required; without it the error is rendered as
		// "%!(EXTRA ...)" noise appended to the message.
		global.GvaLogger.Sugar().Errorf("基金詳情入庫失敗: %v", create.Error)
		return
	}
	global.GvaLogger.Sugar().Infof("基金詳情抓取成功,共: %v 條", create.RowsAffected)
}
4.2 過濾已有詳情的 code(`dao.FilterBasicFund`)
// FilterBasicFund returns the fund codes present in fas_fund_day_top that have
// no matching row in fas_fund_basis, i.e. funds whose details are still missing.
//
// The result is never nil. If the query fails the error is logged and the
// rows scanned so far (normally none) are returned.
func FilterBasicFund() []FilterBasicResult {
	res := []FilterBasicResult{}
	err := global.GvaMysqlClient.Raw("SELECT A.fund_code,B.`code` from fas_fund_day_top as A LEFT JOIN fas_fund_basis as B on A.fund_code = B.`code` WHERE B.`code` is NULL GROUP BY A.fund_code").Scan(&res).Error
	if err != nil {
		// Surface the failure instead of silently reporting "nothing to crawl".
		global.GvaLogger.Sugar().Errorf("查詢待抓取基金失敗: %v", err)
	}
	return res
}
4.3 分組抓取函數(`crawlByGroup`)
// crawlByGroup splits basicResults into groups via splitFundBasicList, crawls
// each group in its own goroutine and sends every successfully parsed fund to
// c. Grouping bounds the number of concurrent crawlers so the site does not
// reject the requests.
//
// c is closed once all goroutines have finished. The function only returns
// after that, so c must be buffered for all results or drained concurrently
// by the caller.
func crawlByGroup(basicResults []dao.FilterBasicResult, c chan<- entity.FundBasis) {
	groupNum := 15
	fundCodeGroup := splitFundBasicList(basicResults, groupNum)

	var wg sync.WaitGroup
	for _, results := range fundCodeGroup {
		// Count each goroutine as it is started. Adding a fixed groupNum up
		// front deadlocks Wait when fewer groups exist and panics with a
		// negative counter when more exist.
		wg.Add(1)
		group := results // per-iteration copy for the closure (pre-Go 1.22 semantics)
		go func() {
			defer wg.Done()
			for _, item := range group {
				f := BasisCrawl{}
				// Fetch and parse the detail page.
				f.CrawlHtml(item.FundCode)
				if f.Code == "" {
					// Nothing was parsed for this fund; skip it.
					continue
				}
				c <- f.ConvertToEntity()
			}
		}()
	}
	wg.Wait()
	// All senders are done; closing lets the receiver's range loop terminate.
	close(c)
}
4.4 根據 `Code` 爬取詳情(`CrawlHtml`)
// CrawlHtml fetches https://fundf10.eastmoney.com/jbgk_<fundCode>.html and
// unmarshals the div[class='txt_cont'] element into f using the selector tags
// declared on BasisCrawl.
//
// Nothing is returned. Every failure is logged through global.GvaLogger and f
// is left with whatever fields were filled, so callers detect a failed crawl
// by checking that f.Code is still empty.
func (f *BasisCrawl) CrawlHtml(fundCode string) {
	collector := colly.NewCollector(colly.UserAgent(crawl.UserAgent), colly.Async(true))
	collector.OnError(func(_ *colly.Response, err error) {
		global.GvaLogger.Sugar().Errorf("基金%s,信息獲取失敗: %s", fundCode, err)
	})
	// Fund overview section.
	collector.OnHTML("div[class='txt_cont']", func(element *colly.HTMLElement) {
		if err := element.Unmarshal(f); err != nil {
			// Logged like every other failure here, including the fund code,
			// instead of being printed to stdout.
			global.GvaLogger.Sugar().Errorf("基金%s,頁面解析失敗: %s", fundCode, err)
		}
	})
	// Rate limiting. The original author notes that requests do not return
	// normally without it.
	err := collector.Limit(&colly.LimitRule{
		DomainGlob:  "*fundf10.eastmoney.*",
		Delay:       500 * time.Millisecond,
		RandomDelay: 500 * time.Millisecond,
		Parallelism: 20,
	})
	if err != nil {
		global.GvaLogger.Sugar().Errorf("設置限速失敗: %s", err)
		return
	}
	err = collector.Visit(fmt.Sprintf("https://fundf10.eastmoney.com/jbgk_%s.html", fundCode))
	if err != nil {
		global.GvaLogger.Sugar().Errorf("基金%s,信息請求失敗: %s", fundCode, err)
	}
	// The collector is asynchronous; block until the request has completed.
	collector.Wait()
}
4.5 數據清洗(`ConvertToEntity`)
// ConvertToEntity cleans the raw scraped strings in f and returns them as an
// entity.FundBasis.
//
// Missing or malformed cells never cause a panic: a Type without "-" leaves
// SubType empty, an establish cell without "/" leaves EstablishShares at 0,
// and numeric fields that cannot be parsed are left at 0.
//
// Side effect: when f.Code contains "、" it is truncated to the part before it.
func (f *BasisCrawl) ConvertToEntity() entity.FundBasis {
	var fundBaseEntity entity.FundBasis

	// Some pages list two codes, e.g. "006049(前端)、006050(後端)"
	// (https://fundf10.eastmoney.com/jbgk_006049.html); keep the first one.
	if strings.Contains(f.Code, "、") {
		f.Code = strings.Split(f.Code, "、")[0]
	}
	fundBaseEntity.Code = utils.ExtractNumberFromString(f.Code)
	fundBaseEntity.FullName = f.FullName
	fundBaseEntity.ShortName = f.ShortName

	// Type is split on "-" into main type and sub type. The sub type is
	// optional here so a cell without "-" does not index out of range.
	typeInfo := strings.Split(f.Type, "-")
	fundBaseEntity.MainType = typeInfo[0]
	if len(typeInfo) > 1 {
		fundBaseEntity.SubType = typeInfo[1]
	}

	// Fund company.
	fundBaseEntity.Company = f.Company
	// Fund manager and the link to the manager page, with "//" stripped.
	fundBaseEntity.Manager = f.Manager
	fundBaseEntity.ManagerDesc = strings.ReplaceAll(f.ManagerDesc, "//", "")
	fundBaseEntity.Benchmark = f.Benchmark

	// Release date.
	fundBaseEntity.ReleaseDate = replaceDateChinese(f.ReleaseDate)

	// The establish cell holds "<date> / <shares>"; both struct fields carry
	// the same raw text. Split always yields at least one element, so index 0
	// is safe.
	fundBaseEntity.EstablishDate = strings.TrimSpace(replaceDateChinese(strings.Split(f.EstablishDate, "/")[0]))

	// Establish shares: only present when the cell actually contains "/".
	if shareParts := strings.Split(f.EstablishShares, "/"); len(shareParts) > 1 {
		establishShares := utils.ExtractNumberFromString(replaceDateChinese(shareParts[1]))
		// Best effort: an unparsable value leaves the field at 0.
		fundBaseEntity.EstablishShares, _ = strconv.ParseFloat(establishShares, 64)
	}

	// Fee rates are best effort as well; parse errors leave the field at 0.
	// Management fee rate.
	manageFeeRate := utils.ExtractNumberFromString(f.ManageFeeRate)
	fundBaseEntity.ManageFeeRate, _ = strconv.ParseFloat(manageFeeRate, 64)
	// Custody fee rate.
	fundBaseEntity.CustodyFeeRate, _ = strconv.ParseFloat(utils.ExtractNumberFromString(f.CustodyFeeRate), 64)
	// Sales service fee rate.
	fundBaseEntity.SaleFeeRate, _ = strconv.ParseFloat(utils.ExtractNumberFromString(f.SaleFeeRate), 64)

	return fundBaseEntity
}
- 註冊定時任務
5.1 實現 Job
// FundBasicCron is the job type registered with the cron scheduler to crawl
// fund details; its Run method performs the work.
type FundBasicCron struct {
// Code is not read by Run.
Code string
}
// Run executes one crawl of the fund details and prints start and finish
// messages, including the elapsed wall-clock time in milliseconds, to stdout.
func (c FundBasicCron) Run() {
	startedAt := time.Now().UnixMilli()
	fmt.Println("基金詳情-定時任務開始運行")

	// Do the actual crawl.
	fund.BatchBasicCrawl()

	elapsed := time.Now().UnixMilli() - startedAt
	fmt.Printf("基金詳情-定時任務運行完成,耗時:%vms\n", elapsed)
}
5.2 設置啓動頻率
// addJob adds jobs to the cron scheduler c. The article elides the other
// registrations with "..." below.
func addJob(c *cron.Cron) {
...
// Crawl fund basic information (every day at 22:30).
// NOTE(review): the spec has six fields, presumably with a leading seconds
// field — confirm the scheduler is created with seconds support.
// The entry ID and error returned by AddJob are discarded.
_, _ = c.AddJob("0 30 22 */1 * *", crontab.FundBasicCron{})
}
- 運行效果
本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源:https://mp.weixin.qq.com/s/nvRB7Xw6VoFn0XhkjuQXvw