Saturday, June 6, 2015

Day 16 - Comparison of HTML parsers for a web crawler, today GoQuery

GoQuery (github.com/PuerkitoBio/goquery) is the king of Go HTML parsing. After
trying the two other relevant libs, I can conclude:

  • GoQuery: All you want; you can drive it like a Ferrari or like a heavy-load truck (see the sketch below)
  • Scrape: Small, light, and neat; it's your bicycle, it always works
  • go-html-transform: To me it feels like a power plant with all the buttons labeled in Russian
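
To give a feel for GoQuery's jQuery-style selector API, here is a minimal,
self-contained sketch (the HTML snippet and the "item" class are made up for
illustration):

package main

import (
    "fmt"
    "strings"

    "github.com/PuerkitoBio/goquery"
)

func main() {
    html := `<ul><li class="item">foo</li><li class="item">bar</li></ul>`
    doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
    if err != nil {
        panic(err)
    }
    // Select every element with class "item" and print its text
    doc.Find(".item").Each(func(i int, s *goquery.Selection) {
        fmt.Println(i, s.Text())
    })
}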


The following example, which scrapes posts from Hacker News, has been stripped
down to fit on one page. Almost all error handling and debug printouts have
been removed, so only the pure GoQuery logic remains.
The full source is available at github.com/hackernewsCrawlerGoQuery.


// Imports used by this excerpt: io, strings, time,
// github.com/PuerkitoBio/goquery, and the repo's own post package.

// ParseHtmlHackerNews parses the Hacker News HTML read from body and
// returns the recognized posts in the psout slice.
// Errors that affect only a single post are stored in that post's Err field.
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {
    // Create a goquery document to parse from an io.Reader
    doc, err := goquery.NewDocumentFromReader(body)
    if err != nil {
        return nil, err
    }
    // Find Hacker News posts: elements with class "athing"
    thing := doc.Find(".athing")
    for iThing := range thing.Nodes {
        // Create a new post struct - if the crawling fails 
        // the post will have its Err set, but will be added  
        // to the outgoing (psout) slice nevertheless
        post := post.NewPost()
        ps = append(ps, &post)
        // use singlearticle as a selection of one single post
        singlearticle := thing.Eq(iThing)
        // Get the next element containing additional info for this post
        scorenode := singlearticle.Next()
        // Get the post title
        htmlpost := singlearticle.Find(".title a").First()
        post.Title = htmlpost.Text()
        // Get the post url
        post.Url, _ = htmlpost.Attr("href")
        // Get the post score
        scoretag := scorenode.Find(".subtext .score").First()
        post.SetScore(strings.Split(scoretag.Text(), " ")[0])
        // Get the post id
        postid, _ := scoretag.Attr("id")
        post.PostId = strings.Split(postid, "_")[1]
        // Get the username and postdate
        hrefs := scorenode.Find(".subtext a")
        for i := range hrefs.Nodes {
            href := hrefs.Eq(i)
            t, _ := href.Html()
            s, exists := href.Attr("href")
            if exists {
                if strings.HasPrefix(s, "user?id") {
                    post.User = t
                    continue
                }
                if strings.HasPrefix(s, "item?id") {
                    if strings.Contains(t, "ago") {
                        var postDate time.Time
                        postDate, err = GetDateFromCreatedAgo(t)
                        if err != nil {
                            // A date that fails to parse affects only this post
                            post.Err = err
                            err = nil
                            continue
                        }
                        post.PostDate = postDate
                    }
                }
            }
        }
    }
    return ps, err
}
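
To try the parser end to end, a minimal driver could look like the sketch
below. The import paths are hypothetical (ParseHtmlHackerNews and the post
package live in the full source); the URL and the error handling here are
mine:

package main

import (
    "fmt"
    "log"
    "net/http"

    // Hypothetical import paths - adjust them to wherever the full
    // source of the crawler and its post package actually live.
    crawler "github.com/user/hackernewsCrawlerGoQuery"
    "github.com/user/hackernewsCrawlerGoQuery/post"
)

func main() {
    resp, err := http.Get("https://news.ycombinator.com/")
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()

    var ps []*post.Post
    ps, err = crawler.ParseHtmlHackerNews(resp.Body, ps)
    if err != nil {
        log.Fatal(err)
    }
    for _, p := range ps {
        fmt.Println(p.Title, "-", p.Url)
    }
}

GetDateFromCreatedAgo comes from the full source as well; it converts link
text like "3 hours ago" back into an absolute time.Time. A rough sketch of
the idea (my own, not the repo's exact code):

package crawler

import (
    "fmt"
    "strconv"
    "strings"
    "time"
)

// GetDateFromCreatedAgo turns text like "3 hours ago" into an absolute
// time by subtracting the stated duration from time.Now().
func GetDateFromCreatedAgo(s string) (time.Time, error) {
    fields := strings.Fields(s) // e.g. ["3", "hours", "ago"]
    if len(fields) < 3 {
        return time.Time{}, fmt.Errorf("unexpected date text: %q", s)
    }
    n, err := strconv.Atoi(fields[0])
    if err != nil {
        return time.Time{}, err
    }
    var unit time.Duration
    switch strings.TrimSuffix(fields[1], "s") {
    case "minute":
        unit = time.Minute
    case "hour":
        unit = time.Hour
    case "day":
        unit = 24 * time.Hour
    default:
        return time.Time{}, fmt.Errorf("unknown unit: %q", fields[1])
    }
    return time.Now().Add(-time.Duration(n) * unit), nil
}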

1 comment:

  1. Nice, I already love GoQuery, thanks for the confirmation. Also the scrape demo is useful, as there are not many examples using scrape out there.