In the last example the chapters were crawled one by one. I wanted to make it concurrent, but couldn't work out how to fetch concurrently and still write the file in order, so I took a shortcut: each chapter is first written to its own numbered txt file, and the pieces are then merged in order into a single txt (a sketch of a temp-file-free alternative follows the listing).

package main

import (
	"bufio"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/fesiong/goproject/convert"
)
// urlRe matches chapter file names like 1234567.html in the index page (the dot is escaped so it can't match an arbitrary character).
const urlRe = `\d{7,}\.html`

// contentRe captures the chapter body non-greedily, from the first <br> to the closing </div>.
const contentRe = `(?U:<br>[\s\S]+</div>)`
type Book struct {
	BookUrl   string
	BookTitle string
	Chapters  []Chapter
}

type Chapter struct {
	Id         int // position in the table of contents; doubles as the temp file name and the merge order
	ChapterUrl string
}
func main() {
	book := &Book{}
	book.BookUrl = "https://www.ptwxz.com/html/10/10043/"
	book.ParseChapters(Fetch(book.BookUrl))
}
func Fetch(url string) string {
	res, err := convert.Request(url, &convert.Options{})
	if err != nil { // check the error before touching res, or a failed request panics on a nil response
		log.Fatal(err)
	}
	// note: this sets the header on the response object after the request has already been sent
	res.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
	return res.Body
}
func (book *Book) ParseChapters(html string) {
	start := time.Now()
	book.BookTitle = selectTitle(html)
	book.BookTitle = strings.ReplaceAll(book.BookTitle, "最新章节", "") // strip the "latest chapters" suffix from the page title
	rep := regexp.MustCompile(urlRe)
	match := rep.FindAllStringSubmatch(html, -1) // -1 returns every match; a positive n would return at most n matches
	for k, v := range match {
		book.Chapters = append(book.Chapters, Chapter{
			Id:         k,
			ChapterUrl: book.BookUrl + v[0],
		})
	}
	wg := &sync.WaitGroup{}
	jobchan := make(chan *Chapter)
	go createWork(20, jobchan, wg)
	for _, m := range book.Chapters {
		c := m     // copy the loop variable before taking its address, otherwise every send points at the same m and the workers all see the same chapter
		wg.Add(1)  // Add before the send: a fast worker could otherwise call Done before Add and corrupt the counter
		jobchan <- &c
	}
	close(jobchan) // lets the workers' range loops exit once the queue drains
	wg.Wait()
	combinefiles(book.BookTitle, len(book.Chapters))
	fmt.Printf("time: %.2fs\n", time.Since(start).Seconds())
}
func Parsecontent(html string) string {
	rep := regexp.MustCompile(contentRe)
	match := rep.FindString(html) // first match of the chapter body
	title := selectTitle(html)
	match = strings.ReplaceAll(match, " ", "")
	match = strings.ReplaceAll(match, "<br />", "\n")
	match = strings.ReplaceAll(match, "<br>", "")
	match = strings.ReplaceAll(match, "\n\n", "\n")
	match = strings.ReplaceAll(match, "</div>", "")
	fmt.Println("crawled: " + title)
	return title + "\n" + match
}
// writefile creates (or truncates) title.txt and writes the chapter text into it.
func writefile(f string, title string) {
	filePath := title + ".txt"
	file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE, 0666)
	if err != nil {
		fmt.Println("open file err", err)
		return // don't keep going with a nil file handle
	}
	defer file.Close() // defer right after a successful open, not at the end of the function
	// buffer the writes with a *bufio.Writer
	write := bufio.NewWriter(file)
	if _, err = write.WriteString(f); err != nil {
		fmt.Println(err.Error())
	}
	// Flush pushes the buffered data out to the underlying file
	if err = write.Flush(); err != nil {
		fmt.Println(err.Error())
	}
}
// selectTitle returns the text of the page's <h1> element.
func selectTitle(html string) string {
	dom, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		log.Fatalln(err)
	}
	var title string
	dom.Find("h1").Each(func(i int, selection *goquery.Selection) {
		title = selection.Text()
	})
	return title
}
// createWork starts num worker goroutines that consume chapters from jobChan until it is closed.
func createWork(num int, jobChan chan *Chapter, wg *sync.WaitGroup) {
	for i := 0; i < num; i++ {
		go func(jobChan chan *Chapter, wg *sync.WaitGroup) {
			for job := range jobChan {
				contents := Parsecontent(Fetch(job.ChapterUrl)) // crawl the chapter page
				writefile(contents, strconv.Itoa(job.Id+1))
				fmt.Printf("writing: %d.txt\n", job.Id+1)
				wg.Done()
			}
		}(jobChan, wg)
	}
}
// combinefiles appends 1.txt through lens.txt to destination.txt in order, deleting each piece after it is copied.
func combinefiles(destination string, lens int) {
	filePath := destination + ".txt"
	file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		fmt.Println("open file err", err)
		return
	}
	defer file.Close() // defer right after a successful open, not at the end of the function
	// buffer the writes with a *bufio.Writer
	write := bufio.NewWriter(file)
	for i := 0; i < lens; i++ {
		sourcepath := strconv.Itoa(i+1) + ".txt"
		content, err := ioutil.ReadFile(sourcepath) // the novel is small, so reading each piece fully into memory is fine
		if err != nil {
			panic(err)
		}
		if _, err = write.WriteString(string(content)); err != nil {
			fmt.Println(err.Error())
		}
		if err = os.Remove(sourcepath); err != nil {
			fmt.Println("file remove error:", err)
		}
	}
	// one Flush at the end pushes everything out to the file
	if err = write.Flush(); err != nil {
		fmt.Println(err.Error())
	}
}
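
For reference, the "concurrent fetch, in-order write" that the temp files work around can also be done with a results slice: every worker fills only its own slot, and the slice is written out once after wg.Wait(). This is a minimal sketch under assumptions of my own, not part of the crawler above: it uses plain net/http instead of convert.Request, skips the content parsing, and the urls slice, the worker count of 20, and the book.txt name are placeholders.

package main

import (
	"bufio"
	"fmt"
	"io"
	"net/http"
	"os"
	"sync"
)

// fetch downloads a page body with the standard library (a stand-in for the crawler's Fetch).
func fetch(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	b, err := io.ReadAll(resp.Body)
	return string(b), err
}

func main() {
	urls := []string{}                   // placeholder: chapter URLs in reading order
	results := make([]string, len(urls)) // one slot per chapter, indexed by position

	jobs := make(chan int)
	wg := &sync.WaitGroup{}
	for w := 0; w < 20; w++ { // same worker count as the crawler above
		go func() {
			for i := range jobs {
				body, err := fetch(urls[i])
				if err != nil {
					fmt.Println("fetch err:", err)
				}
				results[i] = body // each worker touches only its own index, so no mutex is needed
				wg.Done()
			}
		}()
	}
	for i := range urls {
		wg.Add(1)
		jobs <- i
	}
	close(jobs)
	wg.Wait() // after this, every slot is filled and safe to read

	// write everything once, already in order: no per-chapter files, no merge step
	file, err := os.Create("book.txt")
	if err != nil {
		panic(err)
	}
	defer file.Close()
	write := bufio.NewWriter(file)
	for _, chapter := range results {
		write.WriteString(chapter)
	}
	if err = write.Flush(); err != nil {
		panic(err)
	}
}

Because the workers write to disjoint indexes and wg.Wait() orders those writes before the final read, the slice can be read back race-free without any locking.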