Skip to content

Commit 14e362d

Browse files
author
kw.lei
committed
First commit
0 parents  commit 14e362d

File tree

8 files changed

+353
-0
lines changed

8 files changed

+353
-0
lines changed

.idea/csdn-hexo.iml

Lines changed: 9 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/misc.xml

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/modules.xml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/workspace.xml

Lines changed: 203 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go.mod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
module csdn-hexo
2+
3+
go 1.12

hexo.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package main

main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package main

spider.go

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
package main
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
"io/ioutil"
7+
"net/http"
8+
"regexp"
9+
"strings"
10+
)
11+
12+
// Crawl posts from CSDN
13+
14+
// URL templates of the CSDN endpoints queried by this file. Both are fmt
// format strings.
const (
	// ListPostURL formats the article-list page of a blog: %s is the blog
	// username and %d is the page number.
	ListPostURL = "https://blog.csdn.net/%s/article/list/%d?"
	// PostDetailURL formats the editor endpoint queried for a single
	// article: %s is the article id.
	PostDetailURL = "https://mp.csdn.net/mdeditor/getArticle?id=%s"
)
18+
19+
type DetailData struct {
20+
Data PostDetail `json:"data"`
21+
}
22+
23+
// PostDetail holds the article fields decoded from the "data" object of the
// article-detail response. Every field is taken verbatim as a JSON string.
type PostDetail struct {
	// Title is decoded from the "title" key.
	Title string `json:"title"`
	// Description is decoded from the "description" key.
	Description string `json:"description"`
	// Markdowncontent is decoded from the "markdowncontent" key
	// (presumably the Markdown source of the post; confirm against a live
	// response).
	Markdowncontent string `json:"markdowncontent"`
	// Tags is decoded from the "tags" key as a single string; how several
	// tags are separated is not visible here.
	Tags string `json:"tags"`
}
29+
30+
func GetPageSize(username string) (int, error) {
31+
client := http.Client{}
32+
33+
resp, err := client.Get(fmt.Sprintf(ListPostURL, username, 1))
34+
if err != nil {
35+
return 0,err
36+
}
37+
38+
data, err := ioutil.ReadAll(resp.Body)
39+
40+
r := regexp.MustCompile(`class="ui-pager">.*?</li>`)
41+
finds := r.FindAll(data, -1)
42+
43+
for _,f := range finds {
44+
ss := strings.Split(string(f), `<`)
45+
fmt.Println(ss)
46+
}
47+
48+
return 0, nil
49+
}
50+
51+
// Crawl posts by username
52+
func CrawlPosts(username string, page int) ([]string, error) {
53+
client := http.Client{}
54+
55+
resp, err := client.Get(fmt.Sprintf(ListPostURL, username, page))
56+
if err != nil {
57+
return nil,err
58+
}
59+
60+
data, err := ioutil.ReadAll(resp.Body)
61+
62+
r := regexp.MustCompile(`<h4 class="">\s*<a href=".*?"`)
63+
finds := r.FindAll(data, -1)
64+
65+
var urls []string
66+
67+
for _,f := range finds {
68+
ss := strings.Split(string(f), `"`)
69+
if len(ss) >= 4 {
70+
urls = append(urls, ss[3])
71+
}
72+
}
73+
74+
return urls,err
75+
}
76+
77+
func CrawlPostMarkdown(url string) (*PostDetail, error){
78+
79+
index := strings.LastIndex(url, "/")
80+
id := url[index+1:]
81+
82+
client := http.Client{}
83+
84+
req, _ := http.NewRequest("GET", fmt.Sprintf(PostDetailURL, id), nil)
85+
req.Header.Set("cookie","uuid_tt_dd=10_33227520360-1562155374449-785950; UN=junmoxi; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_33227520360-1562155374449-785950!5744*1*junmoxi!1788*1*PC_VC; smidV2=20190705154448794d4aea42482882ccb01b435d4655850093278d5d0bb12e0; OUTFOX_SEARCH_USER_ID_NCOO=1275289703.8182168; dc_session_id=10_1565764323161.169173; UserName=junmoxi; UserInfo=de709e85392f4b8a8d19d69eb2273c56; UserToken=de709e85392f4b8a8d19d69eb2273c56; UserNick=java%E6%B4%BE%E5%A4%A7%E6%98%9F; AU=B09; BT=1567597499382; p_uid=U000000; notice=1; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1569480050,1569545487,1569720826,1569734799; Hm_lpvt_6bcd52f51e9b3dce32bec4a3")
86+
87+
resp, err := client.Do(req)
88+
if err != nil {
89+
return nil, err
90+
}
91+
92+
data, err := ioutil.ReadAll(resp.Body)
93+
if err != nil {
94+
return nil, err
95+
}
96+
97+
detail := new(DetailData)
98+
err = json.Unmarshal(data, detail)
99+
if err != nil {
100+
return nil, err
101+
}
102+
fmt.Println(string(data))
103+
104+
fmt.Printf("%+v \n", detail)
105+
106+
return nil, nil
107+
}
108+
109+
func main() {
110+
//urls, err := CrawlPosts("junmoxi", 1)
111+
//if err != nil {
112+
// panic(err)
113+
//}
114+
//
115+
//for _,url := range urls{
116+
// fmt.Print(url)
117+
//}
118+
119+
CrawlPostMarkdown("https://blog.csdn.net/junmoxi/article/details/101631412")
120+
121+
// GetPageSize("junmoxi")
122+
}

0 commit comments

Comments
 (0)