欢迎光临
我们一直在努力

go语言学习例子No.32-爬取小说并发

上次的例子是一章一章顺序爬取的。这次想改成并发，但一时想不出怎样在并发爬取的同时又按章节顺序写入同一个文件，只好取个巧：先把每一章按编号写到各自单独的 txt 文件里，再按顺序合并成一个 txt 文件。

package main

import (
	"bufio"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"github.com/fesiong/goproject/convert"
	"io/ioutil"
	"log"
	"os"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"
)
// Regular expressions used to scrape the site.
const (
	// urlRe matches a chapter page file name on the book index page: seven
	// or more digits followed by a literal ".html". The dot is escaped so it
	// cannot match an arbitrary character.
	urlRe = `\d{7,}\.html`

	// contentRe matches, non-greedily, from a "<br>" up to the nearest
	// following "</div>". Parsecontent uses it to cut the chapter text out
	// of a chapter page.
	contentRe = `(?U:<br>[\s\S]+</div>)`
)

// Data model of the crawler.
type (
	// Book describes the novel being crawled.
	Book struct {
		// BookUrl is the URL of the book's index page. Chapter file names
		// are appended to it to build chapter URLs.
		BookUrl string
		// BookTitle is the title read from the index page.
		BookTitle string
		// Chapters lists the chapters in the order they were found.
		Chapters []Chapter
	}

	// Chapter identifies a single chapter page of a Book.
	Chapter struct {
		// Id is the zero-based position of the chapter in the index page.
		Id int
		// ChapterUrl is the URL the chapter page is fetched from.
		ChapterUrl string
	}
)

// main crawls one hard-coded book: it downloads the index page and passes it
// to ParseChapters, which handles the chapters and the output file.
func main() {
	book := &Book{BookUrl: "https://www.ptwxz.com/html/10/10043/"}
	indexHTML := Fetch(book.BookUrl)
	book.ParseChapters(indexHTML)
}

// Fetch downloads url through convert.Request and returns the response body.
//
// A request error is fatal: it is logged with log.Fatal, which terminates
// the process, so callers never observe a failed fetch.
func Fetch(url string) string {
	res, err := convert.Request(url, &convert.Options{})
	// Check the error before touching res: after a failed request res may
	// not be usable, and dereferencing it would crash before the error is
	// reported.
	if err != nil {
		log.Fatal(err)
	}
	// NOTE(review): this sets User-Agent on the value returned by Request,
	// i.e. after the request has completed, so it presumably has no effect
	// on what was sent to the server. Kept as in the original; confirm
	// whether the header should be supplied through convert.Options instead.
	res.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
	return res.Body
}

// ParseChapters reads the book title and the chapter list from the index
// page html, downloads all chapters concurrently and merges them into
// "<BookTitle>.txt".
//
// The workers started by createWork write each chapter to its own numbered
// file. Once every chapter is done, combinefiles concatenates those files in
// chapter order. The elapsed time is printed at the end.
func (book *Book) ParseChapters(html string) {
	start := time.Now()

	book.BookTitle = strings.ReplaceAll(selectTitle(html), "最新章节", "")

	// Every match of urlRe is a chapter file name relative to the book URL.
	// -1 asks for all matches.
	names := regexp.MustCompile(urlRe).FindAllString(html, -1)
	for i, name := range names {
		book.Chapters = append(book.Chapters, Chapter{
			Id:         i,
			ChapterUrl: book.BookUrl + name,
		})
	}

	wg := &sync.WaitGroup{}
	jobs := make(chan *Chapter)
	// createWork only spawns the worker goroutines and returns, so it does
	// not need a goroutine of its own.
	createWork(20, jobs, wg)

	for i := range book.Chapters {
		// Register the job before handing it over: otherwise a worker could
		// call Done before Add has run, driving the WaitGroup counter
		// negative and panicking.
		wg.Add(1)
		// Each job points at its own slice element (the slice is no longer
		// appended to), so workers never share a loop variable.
		jobs <- &book.Chapters[i]
	}
	// No more jobs: closing the channel ends the workers' range loops so the
	// goroutines exit instead of blocking forever.
	close(jobs)

	wg.Wait()
	combinefiles(book.BookTitle, len(book.Chapters))
	fmt.Printf("time:%.2fs\n", time.Since(start).Seconds())
}

// Parsecontent extracts the chapter text from the chapter page html.
//
// It takes the first contentRe match, strips the markup with a fixed
// sequence of literal replacements, prints the chapter title to stdout and
// returns the title, a newline, and the cleaned text.
func Parsecontent(html string) string {
	body := regexp.MustCompile(contentRe).FindString(html)
	heading := selectTitle(html)

	// The replacements run one after another in this order; each is a single
	// pass over the result of the previous one.
	for _, sub := range [][2]string{
		{"&nbsp;", ""},
		{"<br />", "\n"},
		{"<br>", ""},
		{"\n\n", "\n"},
		{"</div>", ""},
	} {
		body = strings.ReplaceAll(body, sub[0], sub[1])
	}

	fmt.Println("craw:" + heading)
	return heading + "\n" + body
}

// writefile writes the text f to the file "<title>.txt", creating the file
// if necessary and replacing any previous content.
//
// Errors are printed to stdout rather than returned. If the file cannot be
// opened nothing is written.
func writefile(f string, title string) {
	filePath := title + ".txt"
	// O_TRUNC: without it, overwriting an existing, longer file would leave
	// the tail of the old content behind the new text.
	file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
	if err != nil {
		fmt.Println("open file err", err)
		return
	}
	// Registered right after a successful open so the handle is released on
	// every path.
	defer func() {
		if err := file.Close(); err != nil {
			fmt.Println(err.Error())
		}
	}()

	// Buffered writer; Flush pushes the buffered bytes to the file.
	write := bufio.NewWriter(file)
	if _, err := write.WriteString(f); err != nil {
		fmt.Println(err.Error())
		return
	}
	if err := write.Flush(); err != nil {
		fmt.Println(err.Error())
	}
}
// selectTitle parses html and returns the text of an <h1> element, or the
// empty string when the document has none. Every matched <h1> overwrites the
// result, so the text of the last one visited is returned.
//
// A parse error is fatal: it is logged with log.Fatalln.
func selectTitle(html string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		log.Fatalln(err)
	}

	heading := ""
	doc.Find("h1").Each(func(_ int, h1 *goquery.Selection) {
		heading = h1.Text()
	})
	return heading
}
// createWork starts num worker goroutines that consume chapters from jobChan.
//
// For every chapter received, a worker fetches the page, extracts the text,
// writes it to "<Id+1>.txt", prints a progress line and calls wg.Done once.
// A worker keeps running until jobChan is closed and drained.
func createWork(num int, jobChan chan *Chapter, wg *sync.WaitGroup) {
	worker := func() {
		for chapter := range jobChan {
			fileNo := chapter.Id + 1
			// Download and parse the chapter page.
			text := Parsecontent(Fetch(chapter.ChapterUrl))
			writefile(text, strconv.Itoa(fileNo))
			fmt.Printf("正在写:%d.txt\n", fileNo)
			wg.Done()
		}
	}

	for started := 0; started < num; started++ {
		go worker()
	}
}

// combinefiles appends the chapter files "1.txt" .. "<lens>.txt" from the
// current directory, in numeric order, to "<destination>.txt", deleting each
// chapter file once its content has been flushed to the destination.
//
// The destination is opened in append mode, so existing content is kept.
// If the destination cannot be opened, or a write or flush fails, the error
// is printed and the function returns without deleting the remaining chapter
// files, so no chapter is lost. A chapter file that cannot be read causes a
// panic, as before.
func combinefiles(destination string, lens int) {
	filePath := destination + ".txt"
	file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		fmt.Println("open file err", err)
		return
	}
	// Registered right after a successful open so the handle is released on
	// every path, including the panic below.
	defer func() {
		if err := file.Close(); err != nil {
			fmt.Println(err.Error())
		}
	}()

	write := bufio.NewWriter(file)
	for i := 0; i < lens; i++ {
		sourcepath := strconv.Itoa(i+1) + ".txt"
		// Chapters are small, so reading one fully into memory is fine.
		content, err := ioutil.ReadFile(sourcepath)
		if err != nil {
			panic(err)
		}
		if _, err := write.Write(content); err != nil {
			fmt.Println(err.Error())
			return
		}
		// Flush per chapter so the source is only removed after its bytes
		// have actually reached the destination file.
		if err := write.Flush(); err != nil {
			fmt.Println(err.Error())
			return
		}
		if err := os.Remove(sourcepath); err != nil {
			fmt.Println("file remove Error!")
		}
	}
}
 收藏 (0) 打赏

您可以选择一种方式赞助本站

支付宝扫一扫赞助

微信钱包扫描赞助

未经允许不得转载:家里蹲的狐狸 » go语言学习例子No.32-爬取小说并发

分享到: 生成海报
avatar

评论 抢沙发

  • QQ号
  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址

登录

忘记密码 ?

切换登录

注册

我们将发送一封验证邮件至你的邮箱, 请正确填写以完成账号注册和激活