diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 68f8a54..11484be 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -1,22 +1,45 @@ - + + + + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/config/config.go b/config/config.go index 10c38bc..97cbb7b 100755 --- a/config/config.go +++ b/config/config.go @@ -8,7 +8,10 @@ var ( ContentOptions string // 显示版本号 Version bool + // cookie 内容或者cookie文件地址 + Cookie string ) + const VideoPPT="https://www.icourses.cn/web/sword/portal/shareChapter?cid=" const Assignments="http://www.icourses.cn/web/sword/portal/assignments?cid=" const TestPaper="http://www.icourses.cn/web/sword/portal/testPaper?cid=" diff --git a/download/download.go b/download/download.go index 7ba2681..a68da31 100755 --- a/download/download.go +++ b/download/download.go @@ -16,13 +16,71 @@ import ( //批量下载file数组里的文件 -func DownloadFiles(files []utils.File){ +func DownloadFiles(files []utils.File,courseName string){ for _,file:=range(files){ //下载到指定文件夹 - fmt.Println("\n"+filepath.Join(config.OutputPath,file.FilePATH)) - DownloadFile(file.FileURL,filepath.Join(config.OutputPath,file.FilePATH)) + fmt.Println("\n"+filepath.Join(config.OutputPath,courseName,file.FilePATH)) + DownloadFile(file.FileURL,filepath.Join(config.OutputPath,courseName,file.FilePATH)) } } + +//使用cookie下载file文件 +func DownloadCookieFiles(files []utils.File,courseName string,cookie string){ + for _,file:=range(files){ + //下载到指定文件夹 + fmt.Println("\n"+filepath.Join(config.OutputPath,courseName,file.FilePATH)) + DownloadCookieFile(file.FileURL,filepath.Join(config.OutputPath,courseName,file.FilePATH), cookie) + } +} + +//使用cookie下载单个文件 +func DownloadCookieFile(url string,filePath string,cookie string){ + //fmt.Println(url) + //获取需要下载的文件大小 + dataSize:=getFileSize(url) + //获取需要写入的信息 + client := &http.Client{} + reqest, err := http.NewRequest("GET", url, nil) + if err != nil { + log.Fatal(err) + } + reqest.Header.Set("Cookie",cookie) + res, _ := client.Do(reqest) + defer res.Body.Close() + if res.StatusCode != 200 { + log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) + } + + //检查文件是否存在 + if utils.FileExists(filePath){ + fmt.Printf("file already exists, skipping\n") + return + } + //检查目录是否存在 + if _, err := os.Stat(filepath.Dir(filePath)); os.IsNotExist(err) { + //建立目录 + _ = os.MkdirAll(filepath.Dir(filePath), os.ModePerm) + } + //建立进程条,设置参数显示下载速度和下载进度 + bar := progressbar.NewOptions( + int(dataSize), + progressbar.OptionSetBytes(int(dataSize)), + progressbar.OptionShowCount(), + progressbar.OptionShowIts(), + ) + + // 创建文件 + dest, err := os.Create(filePath) + if err != nil { + fmt.Printf("Can't create %s: %v\n", filePath, err) + return + } + defer dest.Close() + // 从reader读入文件 + out := io.MultiWriter(dest, bar) + _, _ = io.Copy(out, res.Body) +} + //根据网络URL获得文件的大小 func getFileSize(url string) int64 { res, err := http.Head(url) @@ -77,3 +135,4 @@ func DownloadFile(url string,filePath string){ out := io.MultiWriter(dest, bar) _, _ = io.Copy(out, res.Body) } + diff --git a/icourse b/icourse index 55236bf..a732c31 100755 Binary files a/icourse and b/icourse differ diff --git a/icourse-darwin b/icourse-darwin index 7d036cf..a4f5a0d 100755 Binary files a/icourse-darwin and b/icourse-darwin differ diff --git a/icourse.exe b/icourse.exe index cbe3b44..8c11b32 100755 Binary files a/icourse.exe and b/icourse.exe differ diff --git a/main.go b/main.go index 36a934c..ea9d5e6 100755 --- a/main.go +++ b/main.go @@ -16,40 +16,30 @@ import ( func init() { flag.BoolVar(&config.Version, "v", false, "Show version") //all为全部下载,most为视频课件以及试卷,也为下载默认选项,videoPPT仅下载视频和课件,exams为仅下载试卷,resources仅下载其它资源 - flag.StringVar(&config.ContentOptions, "c", "most", "Specify the download content {all,most,videoPPT,assignments,testPaper,shareResource}") + flag.StringVar(&config.ContentOptions, "co", "all", "Only for icourse : Specify the download content {all,most,videoPPT,assignments,testPaper,shareResource}\nOnly for chinesemooc : Specify the download content {all, video , PPT}") + //华文慕课的下载选项,只有三个:全部下载,只下载视频 以及 只下载课件 + + //设置下载路径 flag.StringVar(&config.OutputPath, "o", "", "Specify the output path") - //flag.StringVar(&config.StartUrl, "F", "", "course URL") + //设置cookie + flag.StringVar(&config.Cookie, "c", "", "Cookie or the path of Cookie file") } -func download(url string,options string) bool{ - id:=utils.MatchAll(url,`course_([0-9]*)`) - if id != nil{ - //得到课程的id地址 - idNum:=id[0][1] - //fmt.Println(idNum) - switch options{ - case "all": - parser.DownloadAll(idNum) - case "most": - parser.DownloadMost(idNum) - case "videoPPT": - parser.DownloadVideoPPT(idNum) - case "assignments": - parser.DownloadAssignments(idNum) - case "testPaper": - parser.DownloadTestPaper(idNum) - case "shareResource": - parser.DownloadShareResource(idNum) - } - } else{ - //网址不符合格式 - fmt.Printf("this website %s is not supported now",url) - return true + +func download(url string) bool { + domain := utils.Domain(url) + switch domain { + case "icourses": + parser.DownloadIcourse(url,config.ContentOptions) + case "chinesemooc": + //fmt.Println("SUCCESS") + parser.DownloadChinesemooc(url,config.ContentOptions,config.Cookie) } - return true + return true } func main() { //此处参考了annie的代码 + //fmt.Println(parser.GetStartURLs("http://www.chinesemooc.org/mooc/4880","pku_auth=161evS%2BQJtmq%2FGJRyU%2BFhfaNLyG88SrUPqUX5a0eOUW49JVtBaPxY7lt1vp2MvvcC9UaH8qYx3%2B0cSja0MeVNCmDSWRQ; pku_loginuser=univeroner%40gmail.com; pku_reward_log=daylogin%2C1173273; Hm_lvt_ff4f6e9862a4e0e16fd1f5a7f6f8953b=1569321857,1569494843,1569494850,1569759380; PHPSESSID=p72d5gqftbmp65mmr2n9ghrah5; pku__refer=%252Fmooc%252F4880; Hm_lpvt_ff4f6e9862a4e0e16fd1f5a7f6f8953b=1569761588")) flag.Parse() args := flag.Args() @@ -63,10 +53,14 @@ func main() { flag.PrintDefaults() return } + if config.Cookie != ""{ + utils.ReadCookieFromFile(config.Cookie) + //fmt.Println(config.Cookie) + } var isErr bool //可以下载多个url for _, videoURL := range args { - if err := download(strings.TrimSpace(videoURL),config.ContentOptions); err { + if err := download(strings.TrimSpace(videoURL)); err { isErr = true } } diff --git a/parser/chinesemooc.go b/parser/chinesemooc.go new file mode 100644 index 0000000..8bd3f0f --- /dev/null +++ b/parser/chinesemooc.go @@ -0,0 +1,176 @@ +package parser + +import ( + "fmt" + "github.com/PuerkitoBio/goquery" + "icourse/download" + "icourse/utils" + "path/filepath" + "regexp" + "strconv" + "strings" +) + +// 华文慕课下载部分 +func DownloadChinesemooc(url string,options string,cookie string){ + courseName := GetChinesemoocName(url) + fmt.Println(courseName) + switch options{ + case "all": + DownloadCMAll(url,courseName,cookie) + case "video": + DownloadCMVideos(url,courseName,cookie) + case "PPT": + DownloadCMPPTs(url,courseName,cookie) + } + +} + +//下载视频和课件 +func DownloadCMAll(url string,courseName,cookie string){ + DownloadCMVideos(url,courseName,cookie) + DownloadCMPPTs(url,courseName,cookie) +} + +//下载所有视频 +func DownloadCMVideos(url string,courseName,cookie string){ + videoURL,_ := GetStartURLs(url,cookie) + //fmt.Println(videoURL) + files := ExtractCMVideo(videoURL,cookie) + //fmt.Println(files) + //可以直接下载 + download.DownloadCookieFiles(files,courseName,cookie) +} +//下载所有课件 +func DownloadCMPPTs(url string,courseName,cookie string){ + _,PPTURL := GetStartURLs(url,cookie) + //fmt.Println(PPTURL) + files := ExtractCMPPTs(PPTURL,cookie) + //使用cookie进行下载 + download.DownloadCookieFiles(files,courseName,cookie) +} + + +//得到课程名称 +func GetChinesemoocName(url string)string { + data := utils.HttpGet(url) + doc, err := goquery.NewDocumentFromReader(strings.NewReader(data)) + utils.Check(err) + courseName := utils.Format(doc.Find("head > title").Text()) + return courseName +} + +func getVideoURL(url string,cookies string) string{ + data:=utils.HttpGetCookie(url,cookies) + //设置长度,精确匹配 + // 可以改进为自动匹配最高清晰度,,此处默认标清 + //fmt.Println(data) + SDvideo := utils.MatchAll(string(data),`http.{100,200}SD.mp4`) + // + videoUrl := "" + + //如果能选择清晰度的话选择高清 + if SDvideo != nil{ + videoUrl = SDvideo[0][0] + } else { //只有一种清晰度的情况 + videoUrl = utils.MatchAll(string(data),"http.{100,200}.mp4")[0][0] + } + //fmt.Println(videoUrl) + re := regexp.MustCompile(`\\`) + videoUrl = re.ReplaceAllString(videoUrl,"") + return videoUrl +} + +//返回待下载的视频文件列表 +func ExtractCMVideo(url string,cookies string) []utils.File{ + //需要下载的文件集合 + var files []utils.File + //把cookie加在文件里 + data:=utils.HttpGetCookie(url,cookies) + //fmt.Println(data) + doc, err := goquery.NewDocumentFromReader(strings.NewReader(data)) + utils.Check(err) + doc.Find("#coursefile > div.file-lists > div").Each(func(i int, s *goquery.Selection) { + chapterName := s.Find(".main-item >span").Text() + //fmt.Println(chapterName) + + s.Find("div.item-detail > ul > li.light.clearfix").Each(func(i int, p *goquery.Selection) { + //小节的标题 + contentName := p.Find(".course-name >span").Text() + //fmt.Println(contentName) + //每一小节可能有多个视频 + p.Find(".icon-spow-wrap > .video").Each(func(j int, q *goquery.Selection) { + + filename,_ := q.Attr("original-title") + str,_ := q.Attr("href") + filePath := filepath.Join(chapterName,contentName,strconv.Itoa(j+1)+filename+".mp4") + //提取两个id + ID := utils.MatchAll(str,`&id=([0-9]*)`)[0][1] + eid := utils.MatchAll(str,`eid=([0-9]*)`)[0][1] + URL := "http://www.chinesemooc.org/api/course_video_watch.php?course_id="+ID+"&eid="+eid + videoURL := getVideoURL(URL,cookies) + //fmt.Println(filePath) + files = append(files, utils.File{videoURL,filePath}) + }) + + }) + }) + return files +} + +//返回待下载的课件文件列表 +func ExtractCMPPTs(url string,cookies string) []utils.File{ + //需要下载的文件集合 + var files []utils.File + //把cookie加在文件里 + data:=utils.HttpGetCookie(url,cookies) + //fmt.Println(data) + doc, err := goquery.NewDocumentFromReader(strings.NewReader(data)) + utils.Check(err) + doc.Find("#coursefile > ul > li").Each(func(i int, s *goquery.Selection) { + chapterName := utils.Format(s.Find("div.title.clearfix").Text()) + //fmt.Println(chapterName) + //每个章节底下的小节 + s.Find(".download-list").Each(func(i int, p *goquery.Selection){ + //提取小节名 + contentName := p.Find(".download-list-tit").Text() + p.Find("ul > li.download-list-num > a").Text() + //fmt.Println(contentName) + filePath := filepath.Join(chapterName,contentName); + //fmt.Println(filePath) + str,_ := p.Find("span[onclick].download-load").Attr("onclick"); + //fmt.Println(str) + //提取pdf下载地址 + URL := utils.MatchAll(str,`window.open\("(.*)",`)[0][1] + //如果是相对路径 + if(utils.MatchAll(URL,`http`) == nil){ + URL = "http://www.chinesemooc.org/" +URL + } + //baseURL := "http://www.chinesemooc.org/" + //fmt.Println(URL) + files = append(files,utils.File{URL,filePath}) + }) + }) + //注意下载的时候需要cookie + return files +} + +//从起始地址得到视频和课件的下载页面 +func GetStartURLs(url string, cookies string) (string,string) { + data:=utils.HttpGetCookie(url,cookies) + doc, err := goquery.NewDocumentFromReader(strings.NewReader(data)) + utils.Check(err) + s := doc.Find("#top-select > div > div > button") + str,_ := s.Attr("onclick") + //fmt.Println(str) + //匹配classesid + //匹配courseid + classID := utils.MatchAll(str,`([0-9]*),`)[0][1] + courseID := utils.MatchAll(str,`,([0-9]*)`)[0][1] + //视频页面url + courseProgress := "http://www.chinesemooc.org/kvideo.php?do=course_progress&kvideoid=" + classID + "&classesid=" + courseID + //fmt.Println(courseProgress) + //课件页面url + courseCware :="http://www.chinesemooc.org/kvideo.php?do=course_cware_list&kvideoid=" + classID + "&classesid=" + courseID + return courseProgress,courseCware +} + diff --git a/parser/parser.go b/parser/icourse.go similarity index 66% rename from parser/parser.go rename to parser/icourse.go index 118d9aa..92badbd 100755 --- a/parser/parser.go +++ b/parser/icourse.go @@ -1,6 +1,7 @@ package parser import ( + "fmt" "github.com/PuerkitoBio/goquery" "icourse/config" "icourse/download" @@ -14,37 +15,79 @@ import ( ) //此处放置部分页面特有大的处理函数 +//总的下载函数 +func DownloadIcourse(url string,options string) bool{ + courseName := utils.Format(GetIcouseName(url)) + id:=utils.MatchAll(url,`course_([0-9]*)`) + + if id != nil{ + //得到课程的id地址 + idNum:=id[0][1] + //fmt.Println(idNum) + switch options{ + case "all": + DownloadAll(idNum,courseName) + case "most": + DownloadMost(idNum,courseName) + case "videoPPT": + DownloadVideoPPT(idNum,courseName) + case "assignments": + DownloadAssignments(idNum,courseName) + case "testPaper": + DownloadTestPaper(idNum,courseName) + case "shareResource": + DownloadShareResource(idNum,courseName) + } + } else{ + //网址不符合格式 + fmt.Printf("this website %s is not supported now",url) + return true + } + return true +} + + +//获取课程名称,作为课程下载的目录 +func GetIcouseName(url string)string{ + data:=utils.HttpGet(url) + doc, err := goquery.NewDocumentFromReader(strings.NewReader(data)) + utils.Check(err) + courseName := doc.Find("#introduction-body > section.container > section > section > div.course-introduction-infor.pull-left > div.course-title.clearfix").Text() + return courseName +} //不同下载选项的具体实现 -func DownloadAll(id string){ - DownloadMost(id) - DownloadShareResource(id) +func DownloadAll(id string,courseName string){ + DownloadMost(id,courseName) + DownloadShareResource(id,courseName) } -func DownloadMost(id string){ - DownloadVideoPPT(id) - DownloadAssignments(id) - DownloadTestPaper(id) +func DownloadMost(id string,courseName string){ + DownloadVideoPPT(id,courseName) + DownloadAssignments(id,courseName) + DownloadTestPaper(id,courseName) } -func DownloadVideoPPT(id string){ +func DownloadVideoPPT(id string,courseName string){ s:=config.VideoPPT+id files:=extractURLs(s) - download.DownloadFiles(files) + download.DownloadFiles(files,courseName) } -func DownloadAssignments(id string){ +func DownloadAssignments(id string,courseName string){ s:=config.Assignments+id files:=extractURLs(s) - download.DownloadFiles(files) + download.DownloadFiles(files,courseName) } -func DownloadTestPaper(id string){ +func DownloadTestPaper(id string,courseName string){ s:=config.TestPaper+id files:=extractURLs(s) - download.DownloadFiles(files) + download.DownloadFiles(files,courseName) } -func DownloadShareResource(id string){ +func DownloadShareResource(id string,courseName string){ s :=config.ShareResource+id files:=extractOthers(s) - download.DownloadFiles(files) + download.DownloadFiles(files,courseName) } + + //根据sectionID,构造post请求,parentPath为前面的路径名,返回文件的数组 func getVideo(id string,parentPath string) []utils.File { var files []utils.File diff --git a/utils/utils.go b/utils/utils.go index ac8be9a..372dae4 100755 --- a/utils/utils.go +++ b/utils/utils.go @@ -43,7 +43,7 @@ func FileExists(filePath string) bool{ //去除字符串里的空格换行等 func Format(str string)string{ - re := regexp.MustCompile("[\r\n\t]") + re := regexp.MustCompile("[\r\n\t ]") res := re.ReplaceAllString(str, "") return res } @@ -74,6 +74,21 @@ func MatchAll(text, pattern string) [][]string { return value } +//获取url链接的域名 +// Domain get the domain of given URL +func Domain(url string) string { + domainPattern := `([a-z0-9][-a-z0-9]{0,62})\.` + + `(com\.cn|com\.hk|` + + `cn|com|net|edu|gov|biz|org|info|pro|name|xxx|xyz|be|` + + `me|top|cc|tv|tt)` + domain := MatchOneOf(url, domainPattern) + if domain != nil { + return domain[1] + } + return "Universal" +} + + //根据url,构造get请求 func HttpGet(s string) string { res, err := http.Get(s) @@ -86,4 +101,33 @@ func HttpGet(s string) string { } body, err := ioutil.ReadAll(res.Body) return string(body) +} + +//根据url以及cookie构造get请求 +func HttpGetCookie(s string,cookie string) string { + client := &http.Client{} + reqest, err := http.NewRequest("GET", s, nil) + if err != nil { + log.Fatal(err) + } + reqest.Header.Set("Cookie",cookie) + res, _ := client.Do(reqest) + defer res.Body.Close() + if res.StatusCode != 200 { + log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) + } + body, err := ioutil.ReadAll(res.Body) + //fmt.Println(string(body)) + return string(body) +} + +//从文件中读取cookie +func ReadCookieFromFile(filePath string){ + //如果cookie是一个文件并且存在 + if _, fileErr := os.Stat(config.Cookie); fileErr == nil { + // Cookie is a file + data, err := ioutil.ReadFile(config.Cookie) + Check(err) + config.Cookie = string(data) + } } \ No newline at end of file