• XSS.stack #1 – первый литературный журнал от юзеров форума

Google SERP Scraper

DimmuBurgor

CPU register
Пользователь
Регистрация
01.12.2021
Сообщения
1 504
Решения
1
Реакции
552
Гарант сделки
6
Written in GO Lang w/ ChromeDP dependency
go get -t -d github.com/chromedp/chromedp and github.com/chromedp/cdproto
The next version will include improved headless functionality, background operation with headless=false, and pagination blocking. If you have a feature request, add it in the comments below.

Код:
package main

import (
    "bufio"
    "context"
    "encoding/json"
    "log"
    "net/url"
    "os"
    "path/filepath"
    "time"

    "github.com/chromedp/cdproto/cdp"
    "github.com/chromedp/chromedp"
)

// Result holds one scraped search-result hyperlink for JSON output.
type Result struct {
    Link string `json:"link"`
}

func main() {
    const (
        port = 9515
    )

opts := append(chromedp. DefaultExecAllocatorOptions[:],
chromedp. Flag("headless", false),
chromedp. Flag("no-sandbox", true),
chromedp. Flag("disable-dev-shm-usage", true),
    )

baseCtx, cancel := context. WithTimeout(context. Background(), 99*time. Second)
    defer cancel()

browserCtx, cancel := chromedp. NewExecAllocator(baseCtx, opts...)
    defer cancel()

tabCtx, cancel := chromedp. NewContext(browserCtx)
    defer cancel()

.log. Print("Enter search query: ")
scanner := bufio. NewScanner(os. Stdin)
scanner. Scan()
input := scanner. Text()

googleURL := url. URL{
        Scheme:   "https",
        Host:     "www.google.com",
        Path:     "/search",
RawQuery: url. Values{"q": {input}, "filter": {"0"}, "num": {"1000"}}. Encode(),
    }

err := chromedp. Run(tabCtx,
chromedp. Navigate(googleURL.String()),
chromedp. WaitReady("div#search"),
    )
    if err != nil {
.log. Fatal(err)
    }

var doc []*cdp. Node
err = chromedp. Run(tabCtx,
chromedp. Nodes("div#search div.g a", &doc),
    )
    if err != nil {
.log. Fatal(err)
    }

    results := []Result{}
    for _, r := range doc {
        deadline, ok1 := baseCtx.Deadline()
if ok1 && time. Now(). After(deadline) {
.log. Println("Context deadline exceeded")
            break
        }
        if err := baseCtx.Err(); err != nil {
.log. Println("Context canceled")
            break
        }

        var href string
        var ok2 bool
err = chromedp. Run(tabCtx,
chromedp. AttributeValue(r.FullXPath(), "href", &href, &ok2),
        )
        if err != nil {
.log. Println(err)
            continue
        }

        if ok2 && href != "" {
            results = append(results, Result{Link: href})
        }
    }

.log. Print("Enter output file path (including filename): ")
scanner. Scan()
filePath := scanner. Text()

filePath, err = filepath. Abs(filePath)
    if err != nil {
.log. Fatal(err)
    }

dir := filepath. Dir(filePath)

err = os. MkdirAll(dir, os. ModePerm)
    if err != nil {
.log. Fatal(err)
    }

file, err := os. Create(filePath)
    if err != nil {
.log. Fatal(err)
    }
defer file. Close()

encoder := json. NewEncoder(file)
encoder. SetIndent("", " ")
err = encoder. Encode(results)
    if err != nil {
.log. Fatal(err)
    }

.log. Printf("Results saved to %s", filePath)

time. Sleep(10 * time. Second)

    cancel()
}
[/code]
 
UPDATES:

  • Added Pagination using URL text fragment Marker method*
  • Wrote a Discrete Recurrence Relation**
  • Randomized Sleep Timer
  • General Overall Improvement
* - Almost all results get scraped to output dest
** - P(n) = min(100, T - P(1) - P(2) - … - P(n-1)), with P(0) = 0
Код:
package main

import (
    "bufio"
    "context"
    "encoding/json"
    "fmt"
    "log"
    "math/rand"
    "net/url"
    "os"
    "path/filepath"
    "strconv"
    "strings"
    "time"

    "github.com/chromedp/cdproto/cdp"
    "github.com/chromedp/chromedp"
)

// Result holds one scraped search-result hyperlink for JSON output.
type Result struct {
    Link string `json:"link"`
}

// Tunables for the scrape session.
const (
    port        = 9515 // Chrome remote-debugging port (passed via flag below)
    sleepMin    = 11   // minimum inter-page delay, seconds
    sleepMax    = 45   // maximum inter-page delay, seconds
    queryNum    = 100  // results requested per page ("num" query parameter)
    progressLen = 50   // progress-bar width in characters
)

// main drives the paginated Google SERP scrape: prompt for a query, walk
// the result pages via the "start" offset, collect every result href, and
// write the collection as JSON to a user-chosen file.
func main() {
	opts := []chromedp.ExecAllocatorOption{
		chromedp.Flag("headless", false),
		chromedp.Flag("window-size", "1280,720"),
		chromedp.Flag("window-position", "0,0"),
		chromedp.Flag("disable-infobars", true),
		chromedp.Flag("disable-extensions", true),
		chromedp.Flag("hide-scrollbars", true),
		chromedp.Flag("mute-audio", true),
		chromedp.Flag("disable-notifications", true),
		chromedp.Flag("disable-gpu", true),
		chromedp.Flag("no-sandbox", true),
		chromedp.Flag("disable-dev-shm-usage", true),
		// Merged into one flag: the original listed disable-features twice,
		// so the first value (site-per-process) was shadowed by the second.
		chromedp.Flag("disable-features", "site-per-process,TranslateUI,BlinkGenPropertyTrees"),
		chromedp.Flag("disable-background-networking", true),
		chromedp.Flag("disable-sync", true),
		chromedp.Flag("disable-background-timer-throttling", true),
		chromedp.Flag("disable-backgrounding-occluded-windows", true),
		chromedp.Flag("disable-ipc-flooding-protection", true),
		chromedp.Flag("disable-breakpad", true),
		chromedp.Flag("disable-client-side-phishing-detection", true),
		chromedp.Flag("disable-component-extensions-with-background-pages", true),
		chromedp.Flag("disable-default-apps", true),
		chromedp.Flag("disable-hang-monitor", true),
		chromedp.Flag("disable-popup-blocking", true),
		chromedp.Flag("disable-prompt-on-repost", true),
		chromedp.Flag("disable-renderer-backgrounding", true),
		chromedp.Flag("disable-sync-preferences", true),
		chromedp.Flag("disable-web-security", true),
		chromedp.Flag("enable-automation", true),
		chromedp.Flag("force-color-profile", "srgb"),
		chromedp.Flag("metrics-recording-only", true),
		chromedp.Flag("password-store", "basic"),
		chromedp.Flag("use-mock-keychain", true),
		chromedp.Flag("remote-debugging-port", strconv.Itoa(port)),
		chromedp.Flag("log-level", "info"),
	}

	baseCtx := context.Background()
	ctx, cancel := chromedp.NewExecAllocator(baseCtx, opts...)
	defer cancel()

	ctx, cancel = chromedp.NewContext(ctx)
	defer cancel()

	log.Print("Enter search query: ")
	scanner := bufio.NewScanner(os.Stdin)
	if !scanner.Scan() {
		log.Fatal("no search query provided")
	}
	input := scanner.Text()

	start := 0
	results := []Result{}
	totalResults := 0
	estimatedTotal := 0

	// One iteration per result page; stops when a page comes back short.
	for {
		googleURL := buildGoogleURL(input, start)

		err := chromedp.Run(ctx,
			chromedp.Navigate(googleURL.String()),
			chromedp.WaitReady("div#search"),
		)
		if err != nil {
			log.Printf("Error navigating to Google search: %v", err)
			break
		}

		// Every anchor inside an organic-result container (div.g).
		var doc []*cdp.Node
		err = chromedp.Run(ctx,
			chromedp.Nodes("div#search div.g a", &doc),
		)
		if err != nil {
			log.Printf("Error retrieving search results: %v", err)
			break
		}

		totalResults += len(doc)
		estimatedTotal = calculateEstimatedTotal(totalResults, start)

		for _, r := range doc {
			// NOTE(review): ctx derives from context.Background() with no
			// deadline, so these checks only fire if the chromedp context
			// is torn down externally — confirm whether a timeout was meant.
			deadline, hasDeadline := ctx.Deadline()
			if hasDeadline && time.Now().After(deadline) {
				log.Println("Context deadline exceeded")
				break
			}
			if err := ctx.Err(); err != nil {
				log.Println("Context canceled")
				break
			}

			var href string
			var found bool
			err = chromedp.Run(ctx,
				chromedp.AttributeValue(r.FullXPath(), "href", &href, &found),
			)
			if err != nil {
				log.Println(err)
				continue
			}

			if found && href != "" {
				results = append(results, Result{Link: href})
			}
		}

		// A short page means Google has no further results.
		if len(doc) < queryNum {
			break
		}

		start += queryNum

		// Randomized delay between pages to look less bot-like.
		sleepRandom(sleepMin, sleepMax)

		printProgress(totalResults, estimatedTotal)
	}

	log.Print("Enter output file path (including filename): ")
	if !scanner.Scan() {
		log.Print("no output path provided")
		return
	}
	filePath := scanner.Text()

	filePath, err := filepath.Abs(filePath)
	if err != nil {
		log.Printf("Error getting absolute file path: %v", err)
		return
	}

	// Create the destination directory if it does not exist yet.
	if err = os.MkdirAll(filepath.Dir(filePath), os.ModePerm); err != nil {
		log.Printf("Error creating directory: %v", err)
		return
	}

	file, err := os.Create(filePath)
	if err != nil {
		log.Printf("Error creating file: %v", err)
		return
	}
	defer file.Close()

	encoder := json.NewEncoder(file)
	encoder.SetIndent("", "  ")
	if err = encoder.Encode(results); err != nil {
		log.Printf("Error encoding JSON: %v", err)
		return
	}

	log.Printf("Results saved to %s", filePath)
}

// buildGoogleURL assembles the Google search URL for the given query and
// zero-based result offset. The "start" parameter is only attached when
// paginating past the first page; filter=0 disables result deduplication.
func buildGoogleURL(query string, start int) *url.URL {
	params := url.Values{}
	params.Set("q", query)
	params.Set("filter", "0")
	params.Set("num", strconv.Itoa(queryNum))
	if start > 0 {
		params.Set("start", strconv.Itoa(start))
	}

	return &url.URL{
		Scheme:   "https",
		Host:     "www.google.com",
		Path:     "/search",
		RawQuery: params.Encode(),
	}
}

// calculateEstimatedTotal estimates the reachable result count: everything
// collected so far plus the consumed offset plus one more full page. It is
// only used to drive the progress display.
func calculateEstimatedTotal(totalResults, start int) int {
	upcoming := start + queryNum
	return totalResults + upcoming
}

func printProgress(current, estimatedTotal int) {
    percentage := float64(current) / float64(estimatedTotal) * 100
    progressBar := strings.Repeat("█", int(percentage)/2)
    fmt.Printf("Progress: [%-50s] %.2f%%\r", progressBar, percentage)
}

func sleepRandom(min, max int) {
    sleepTime := time.Duration(rand.Intn(max-min+1)+min) * time.Second
    time.Sleep(sleepTime)
}
 
Next update will fix the progress bar, Choose JS/Click or URL Marker, Headless and Hidden mode + ...?
 


Напишите ответ...
  • Вставить:
Прикрепить файлы
Верх