转载请标明出处:
http://blog.csdn.net/qq_27818541/article/details/111936188
本文出自:【BigManing的博客】
示例介绍
- 首先访问入口 `主URL`
- 提取 `课程分类URL`
- 访问 `课程分类URL`,获取课程详情信息
示例代码
package main
import (
"encoding/json"
"log"
"os"
"strings"
"github.com/gocolly/colly/v2"
)
// Course is the record collected for a single Coursera course page.
// Every field holds the text exactly as scraped; nothing is parsed or
// normalized. Fields carry no JSON tags, so they are encoded under their
// Go names in declaration order.
type Course struct {
	// Text taken from the body of the course page.
	Title, Description, Creator string
	// Level is filled from the "Level" row of the basic-info table.
	Level string
	// URL is the address of the course page that was scraped.
	URL string
	// Values filled from the "Language", "Commitment", "How To Pass" and
	// "User Ratings" rows of the basic-info table, respectively.
	Language, Commitment, HowToPass, Rating string
}
// main crawls Coursera's browse pages with one collector, hands every course
// link found there to a second collector that extracts the course details,
// and finally writes all collected courses to standard output as indented
// JSON. A failure of the initial request or of the JSON encoding is logged
// and terminates the program with a non-zero exit status.
func main() {
	// Instantiate the default collector.
	c := colly.NewCollector(
		// Visit only domains: coursera.org, www.coursera.org
		colly.AllowedDomains("coursera.org", "www.coursera.org"),
		// Cache responses to prevent multiple downloads of pages
		// even if the collector is restarted.
		colly.CacheDir("./coursera_cache"),
	)

	// Create another collector to scrape course details.
	detailCollector := c.Clone()

	// Non-nil on purpose: an empty result is encoded as [] rather than null.
	courses := make([]Course, 0, 200)

	// On every a element which has an href attribute call the callback.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		// Anchors carrying exactly this class string are irrelevant; skip them.
		if e.Attr("class") == "Button_1qxkboh-o_O-primary_cv02ee-o_O-md_28awn8-o_O-primaryLink_109aggg" {
			return
		}
		link := e.Attr("href")
		// Follow only links that start with /browse and that contain
		// neither "=signup" nor "=login".
		if !strings.HasPrefix(link, "/browse") || strings.Contains(link, "=signup") || strings.Contains(link, "=login") {
			return
		}
		// Start scraping the page under the link found. Best effort: the
		// error is dropped on purpose so that a link which cannot be visited
		// (colly reports already-visited URLs this way, for example) does
		// not interrupt the crawl.
		_ = e.Request.Visit(link)
	})

	// Before making a request print "visiting ...".
	c.OnRequest(func(r *colly.Request) {
		log.Println("visiting", r.URL.String())
	})

	// On every a element which has a name attribute call the callback.
	c.OnHTML(`a[name]`, func(e *colly.HTMLElement) {
		// Activate detailCollector if the link contains "coursera.org/learn".
		courseURL := e.Request.AbsoluteURL(e.Attr("href"))
		if strings.Contains(courseURL, "coursera.org/learn") {
			// Best effort, same reasoning as for the browse links.
			_ = detailCollector.Visit(courseURL)
		}
	})

	// Extract details of the course.
	detailCollector.OnHTML(`div[id=rendered-content]`, func(e *colly.HTMLElement) {
		log.Println("Course found", e.Request.URL)
		title := e.ChildText(".course-title")
		if title == "" {
			// Only logged: the course is still recorded with an empty title.
			log.Println("No title found", e.Request.URL)
		}
		course := Course{
			Title:       title,
			URL:         e.Request.URL.String(),
			Description: e.ChildText("div.content"),
			Creator:     e.ChildText("div.creator-names > span"),
		}
		// Iterate over the rows of the table which contains different
		// information about the course; the first cell names the property,
		// the second cell holds its value.
		e.ForEach("table.basic-info-table tr", func(_ int, el *colly.HTMLElement) {
			switch el.ChildText("td:first-child") {
			case "Language":
				course.Language = el.ChildText("td:nth-child(2)")
			case "Level":
				course.Level = el.ChildText("td:nth-child(2)")
			case "Commitment":
				course.Commitment = el.ChildText("td:nth-child(2)")
			case "How To Pass":
				course.HowToPass = el.ChildText("td:nth-child(2)")
			case "User Ratings":
				course.Rating = el.ChildText("td:nth-child(2) div:nth-of-type(2)")
			}
		})
		courses = append(courses, course)
	})

	// Start scraping on https://coursera.org/browse. If the very first
	// request fails nothing can be collected, so report it instead of
	// silently printing an empty list.
	if err := c.Visit("https://coursera.org/browse"); err != nil {
		log.Fatalf("visiting start page: %v", err)
	}

	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", " ")
	// Dump json to the standard output.
	if err := enc.Encode(courses); err != nil {
		log.Fatalf("encoding courses: %v", err)
	}
}
输出结果
PS:输出结果为 `[]`,究其原因可能是原有界面有变动。我们只要重点关注其中内部实现即可。
2020/12/29 20:15:39 visiting https://coursera.org/browse
2020/12/29 20:15:43 visiting https://www.coursera.org/browse
2020/12/29 20:15:45 visiting https://www.coursera.org/browse/math-and-logic
2020/12/29 20:15:48 visiting https://www.coursera.org/browse/arts-and-humanities
2020/12/29 20:15:50 visiting https://www.coursera.org/browse/arts-and-humanities/history
2020/12/29 20:15:53 visiting https://www.coursera.org/browse/arts-and-humanities/music-and-art
2020/12/29 20:15:56 visiting https://www.coursera.org/browse/arts-and-humanities/philosophy
2020/12/29 20:16:01 visiting https://www.coursera.org/browse/business
2020/12/29 20:16:03 visiting https://www.coursera.org/browse/business/leadership-and-management
2020/12/29 20:16:07 visiting https://www.coursera.org/browse/business/finance
2020/12/29 20:16:12 visiting https://www.coursera.org/browse/business/marketing
2020/12/29 20:16:15 visiting https://www.coursera.org/browse/business/entrepreneurship
2020/12/29 20:16:19 visiting https://www.coursera.org/browse/business/business-essentials
2020/12/29 20:16:20 visiting https://www.coursera.org/browse/business/business-strategy
2020/12/29 20:16:24 visiting https://www.coursera.org/browse/computer-science
2020/12/29 20:16:25 visiting https://www.coursera.org/browse/computer-science/software-development
2020/12/29 20:16:26 visiting https://www.coursera.org/browse/computer-science/mobile-and-web-development
2020/12/29 20:16:30 visiting https://www.coursera.org/browse/computer-science/algorithms
2020/12/29 20:16:33 visiting https://www.coursera.org/browse/computer-science/computer-security-and-networks
2020/12/29 20:16:37 visiting https://www.coursera.org/browse/computer-science/design-and-product
2020/12/29 20:16:40 visiting https://www.coursera.org/browse/data-science
2020/12/29 20:16:43 visiting https://www.coursera.org/browse/data-science/data-analysis
2020/12/29 20:16:46 visiting https://www.coursera.org/browse/data-science/machine-learning
2020/12/29 20:16:49 visiting https://www.coursera.org/browse/data-science/probability-and-statistics
2020/12/29 20:16:52 visiting https://www.coursera.org/browse/information-technology
2020/12/29 20:16:55 visiting https://www.coursera.org/browse/information-technology/cloud-computing
2020/12/29 20:17:00 visiting https://www.coursera.org/browse/information-technology/security
2020/12/29 20:17:02 visiting https://www.coursera.org/browse/information-technology/data-management
2020/12/29 20:17:05 visiting https://www.coursera.org/browse/information-technology/networking
2020/12/29 20:17:09 visiting https://www.coursera.org/browse/information-technology/support-and-operations
2020/12/29 20:17:12 visiting https://www.coursera.org/browse/life-sciences
2020/12/29 20:17:14 visiting https://www.coursera.org/browse/health/animal-health
2020/12/29 20:17:18 visiting https://www.coursera.org/browse/health
2020/12/29 20:17:21 visiting https://www.coursera.org/browse/health/basic-science
2020/12/29 20:17:24 visiting https://www.coursera.org/browse/health/health-informatics
2020/12/29 20:17:27 visiting https://www.coursera.org/browse/health/healthcare-management
2020/12/29 20:17:30 visiting https://www.coursera.org/browse/health/nutrition
2020/12/29 20:17:32 visiting https://www.coursera.org/browse/health/patient-care
2020/12/29 20:17:35 visiting https://www.coursera.org/browse/health/public-health
2020/12/29 20:17:40 visiting https://www.coursera.org/browse/health/research
2020/12/29 20:17:43 visiting https://www.coursera.org/browse/health/psychology
2020/12/29 20:17:45 visiting https://www.coursera.org/browse/personal-development
2020/12/29 20:17:48 visiting https://www.coursera.org/browse/physical-science-and-engineering
2020/12/29 20:17:51 visiting https://www.coursera.org/browse/physical-science-and-engineering/electrical-engineering
2020/12/29 20:17:54 visiting https://www.coursera.org/browse/physical-science-and-engineering/mechanical-engineering
2020/12/29 20:17:57 visiting https://www.coursera.org/browse/physical-science-and-engineering/chemistry
2020/12/29 20:18:00 visiting https://www.coursera.org/browse/physical-science-and-engineering/environmental-science-and-sustainability
2020/12/29 20:18:02 visiting https://www.coursera.org/browse/physical-science-and-engineering/physics-and-astronomy
2020/12/29 20:18:03 visiting https://www.coursera.org/browse/physical-science-and-engineering/research-methods
2020/12/29 20:18:05 visiting https://www.coursera.org/browse/social-sciences
2020/12/29 20:18:09 visiting https://www.coursera.org/browse/social-sciences/economics
2020/12/29 20:18:11 visiting https://www.coursera.org/browse/social-sciences/education
2020/12/29 20:18:14 visiting https://www.coursera.org/browse/social-sciences/governance-and-society
2020/12/29 20:18:15 visiting https://www.coursera.org/browse/social-sciences/law
2020/12/29 20:18:18 visiting https://www.coursera.org/browse/language-learning
2020/12/29 20:18:21 visiting https://www.coursera.org/browse/language-learning/learning-english
2020/12/29 20:18:24 visiting https://www.coursera.org/browse/language-learning/other-languages
[]