小说的网址是:https://www.ptwxz.com/
由于爬取小说需要按章节顺序下载并生成txt文件,暂时想不到如何在并发抓取之后再按顺序追加到文件里,所以就单任务一章一章地爬取
就遇到了网页编码的问题,和字符串删除的问题
网页编码的问题直接用 github.com/fesiong/goproject/convert 这个包来解决
标题我就直接用goquery来抓取
package main
import (
"bufio"
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/fesiong/goproject/convert"
"log"
"os"
"regexp"
"strings"
)
// urlRe matches chapter file names in the catalog page: five or more
// digits followed by ".html" (the "." is an unescaped regex dot, so it
// also matches any character — kept as-is; works for this site).
const urlRe = `\d{5,}.html`
// contentRe captures the chapter body: from the first <br> through the
// closing </div>; the (?U) flag makes the quantifier non-greedy.
const contentRe = `(?U:<br>[\s\S]+</div>)`
// Url is the book catalog page address; chapter links are resolved
// relative to it by simple string concatenation.
type Url string
// booktitle is the book title scraped from the catalog page's <h1>;
// it names the output .txt file. Set in ParseUrl, read in writefile.
var booktitle string
// main fetches the catalog page given as the first CLI argument,
// extracts every chapter URL from it, then downloads each chapter
// sequentially and appends it to the output file. Sequential on
// purpose: chapters must land in the file in reading order.
func main() {
	// Bug fix: the original indexed os.Args[1] unconditionally and
	// panicked when run without an argument.
	if len(os.Args) < 2 {
		log.Fatal("usage: main <book catalog URL, e.g. https://www.ptwxz.com/html/9/9465/>")
	}
	url := Url(os.Args[1])
	html := Fetch(string(url))
	for _, childurl := range url.ParseUrl(html) {
		Parsecontent(Fetch(childurl))
	}
}
// Fetch downloads the page at url and returns its body as a UTF-8
// string; convert.Request also transcodes the site's GBK pages.
// Any request error is fatal — the crawler cannot proceed without
// the page.
func Fetch(url string) string {
	res, err := convert.Request(url, &convert.Options{})
	// Bug fix: the error must be checked BEFORE touching res; the
	// original dereferenced res.Header first, which can panic on a
	// failed request when res is nil.
	if err != nil {
		log.Fatal(err)
	}
	// NOTE(review): the original set a User-Agent on the *response*
	// headers after the request had already completed, which has no
	// effect. If a custom UA is needed it must go into the request
	// via convert.Options — confirm the convert API supports that.
	return res.Body
}
// ParseUrl scans the catalog page html for chapter links (bare
// "<digits>.html" file names) and returns them concatenated onto the
// catalog URL, in page order. As a side effect it stores the book
// title (from the page's <h1>) in the package-level booktitle, later
// used as the output filename.
func (url Url) ParseUrl(html string) []string {
	booktitle = selectTitle(html)
	// The pattern has no capture groups, so FindAllString is the
	// right call (the original used FindAllStringSubmatch and took
	// m[0], which is equivalent but roundabout).
	rep := regexp.MustCompile(urlRe)
	matches := rep.FindAllString(html, -1)
	childurls := make([]string, 0, len(matches))
	for _, m := range matches {
		full := string(url) + m
		fmt.Println(full) // progress: echo each discovered chapter URL
		childurls = append(childurls, full)
	}
	return childurls
}
// Parsecontent extracts the chapter title and body text from a
// chapter page and appends them to the book's txt file. The site's
// body text has no dedicated HTML tag, so it is pulled out with a
// regex and stripped of markup by string replacement.
func Parsecontent(html string) {
	rep := regexp.MustCompile(contentRe)
	match := rep.FindAllStringSubmatch(html, -1)
	title := selectTitle(html)
	// One Replacer pass per match instead of four separate
	// ReplaceAll scans over the same string.
	cleaner := strings.NewReplacer(
		" ", "",
		"<br /><br />", "\n",
		"<br>", "",
		"</div>", "",
	)
	var body string
	for _, m := range match {
		// The page normally yields a single match; if several
		// matched, the last one wins — same as the original loop.
		body = cleaner.Replace(m[0])
	}
	writefile(title + "\n" + body + "\n")
	fmt.Println("正在爬取: " + title)
}
// writefile appends f to "<booktitle>.txt" in the working directory,
// creating the file on first use. The "最新章节" suffix the site
// appends to the title is stripped before it is used as a filename.
func writefile(f string) {
	booktitle = strings.ReplaceAll(booktitle, "最新章节", "")
	filePath := booktitle + ".txt"
	file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		fmt.Println("open file err", err)
		// Bug fix: the original fell through and wrote via a nil
		// *os.File, so every write silently failed.
		return
	}
	defer file.Close()
	// Buffer the writes; Flush pushes the buffered data to disk.
	write := bufio.NewWriter(file)
	if _, err := write.WriteString(f); err != nil {
		fmt.Println("write err", err)
		return
	}
	if err := write.Flush(); err != nil {
		fmt.Println("flush err", err)
	}
}
// selectTitle parses html with goquery and returns the text of the
// page's <h1> element — the last one if several are present, or the
// empty string if there is none. A parse failure is fatal.
func selectTitle(html string) string {
	dom, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		log.Fatalln(err)
	}
	// Equivalent to iterating every h1 and keeping the final text:
	// Last() on an empty selection yields "", matching the original.
	return dom.Find("h1").Last().Text()
}
本来想用goquery来爬取小说的内容,但是查看网页源代码发现正文没有对应的标签,所以只能用正则爬取

爬取到正文后,再用strings.ReplaceAll来消除没用的字符
#运行
go run main.go https://www.ptwxz.com/html/9/9465/