1+ package main
2+
3+ import (
4+ "encoding/json"
5+ "fmt"
6+ "io/ioutil"
7+ "net/http"
8+ "regexp"
9+ "strings"
10+ )
11+
12+ // Crawl posts from CSDN
13+
// URL templates for the CSDN endpoints used by this crawler.
const (
	// ListPostURL is the format string for one page of a user's post list.
	// Verbs, in order: the username (%s) and the page number (%d).
	ListPostURL = "https://blog.csdn.net/%s/article/list/%d?"

	// PostDetailURL is the format string for the article-detail endpoint.
	// The single %s verb is the article id.
	PostDetailURL = "https://mp.csdn.net/mdeditor/getArticle?id=%s"
)
18+
19+ type DetailData struct {
20+ Data PostDetail `json:"data"`
21+ }
22+
// PostDetail holds the article fields decoded from the "data" object of the
// article-detail response. Every field is kept as the plain string found
// under the JSON key named in its tag.
type PostDetail struct {
	Title           string `json:"title"`
	Description     string `json:"description"`
	Markdowncontent string `json:"markdowncontent"`
	Tags            string `json:"tags"`
}
29+
30+ func GetPageSize (username string ) (int , error ) {
31+ client := http.Client {}
32+
33+ resp , err := client .Get (fmt .Sprintf (ListPostURL , username , 1 ))
34+ if err != nil {
35+ return 0 ,err
36+ }
37+
38+ data , err := ioutil .ReadAll (resp .Body )
39+
40+ r := regexp .MustCompile (`class="ui-pager">.*?</li>` )
41+ finds := r .FindAll (data , - 1 )
42+
43+ for _ ,f := range finds {
44+ ss := strings .Split (string (f ), `<` )
45+ fmt .Println (ss )
46+ }
47+
48+ return 0 , nil
49+ }
50+
51+ // Crawl posts by username
52+ func CrawlPosts (username string , page int ) ([]string , error ) {
53+ client := http.Client {}
54+
55+ resp , err := client .Get (fmt .Sprintf (ListPostURL , username , page ))
56+ if err != nil {
57+ return nil ,err
58+ }
59+
60+ data , err := ioutil .ReadAll (resp .Body )
61+
62+ r := regexp .MustCompile (`<h4 class="">\s*<a href=".*?"` )
63+ finds := r .FindAll (data , - 1 )
64+
65+ var urls []string
66+
67+ for _ ,f := range finds {
68+ ss := strings .Split (string (f ), `"` )
69+ if len (ss ) >= 4 {
70+ urls = append (urls , ss [3 ])
71+ }
72+ }
73+
74+ return urls ,err
75+ }
76+
77+ func CrawlPostMarkdown (url string ) (* PostDetail , error ){
78+
79+ index := strings .LastIndex (url , "/" )
80+ id := url [index + 1 :]
81+
82+ client := http.Client {}
83+
84+ req , _ := http .NewRequest ("GET" , fmt .Sprintf (PostDetailURL , id ), nil )
85+ req .Header .Set ("cookie" ,"uuid_tt_dd=10_33227520360-1562155374449-785950; UN=junmoxi; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_33227520360-1562155374449-785950!5744*1*junmoxi!1788*1*PC_VC; smidV2=20190705154448794d4aea42482882ccb01b435d4655850093278d5d0bb12e0; OUTFOX_SEARCH_USER_ID_NCOO=1275289703.8182168; dc_session_id=10_1565764323161.169173; UserName=junmoxi; UserInfo=de709e85392f4b8a8d19d69eb2273c56; UserToken=de709e85392f4b8a8d19d69eb2273c56; UserNick=java%E6%B4%BE%E5%A4%A7%E6%98%9F; AU=B09; BT=1567597499382; p_uid=U000000; notice=1; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1569480050,1569545487,1569720826,1569734799; Hm_lpvt_6bcd52f51e9b3dce32bec4a3" )
86+
87+ resp , err := client .Do (req )
88+ if err != nil {
89+ return nil , err
90+ }
91+
92+ data , err := ioutil .ReadAll (resp .Body )
93+ if err != nil {
94+ return nil , err
95+ }
96+
97+ detail := new (DetailData )
98+ err = json .Unmarshal (data , detail )
99+ if err != nil {
100+ return nil , err
101+ }
102+ fmt .Println (string (data ))
103+
104+ fmt .Printf ("%+v \n " , detail )
105+
106+ return nil , nil
107+ }
108+
109+ func main () {
110+ //urls, err := CrawlPosts("junmoxi", 1)
111+ //if err != nil {
112+ // panic(err)
113+ //}
114+ //
115+ //for _,url := range urls{
116+ // fmt.Print(url)
117+ //}
118+
119+ CrawlPostMarkdown ("https://blog.csdn.net/junmoxi/article/details/101631412" )
120+
121+ // GetPageSize("junmoxi")
122+ }
0 commit comments