crawler google search rank

使用 Selenium 在 Google 上搜尋關鍵字,並爬取搜尋結果中排名前幾名的網站網址

build.sbt

// Enables the Scala.js sbt plugin for this project.
// NOTE(review): the dependencies below include JVM-only libraries (Selenium, jsoup);
// confirm that building this project with the Scala.js plugin is intended.
enablePlugins(ScalaJSPlugin)

// Basic project coordinates and the Scala compiler version.
name := "scala-starter"
scalaVersion := "2.12.3"

// HTML parser dependency.
libraryDependencies += "org.jsoup" % "jsoup" % "1.11.2"

// ScalaTest is limited to the "test" configuration; selenium-java provides the
// WebDriver API. The two `%%%` entries are Scala.js cross-built artifacts.
libraryDependencies ++= Seq(
  "org.scalatest" %% "scalatest" % "3.0.1" % "test",
  "org.seleniumhq.selenium" % "selenium-java" % "3.10.0",
  "org.scala-js" %%% "scalajs-dom" % "0.9.1",
  "be.doeraene" %%% "scalajs-jquery" % "0.9.1"
)
// Have the Scala.js output invoke the project's main method when it is loaded.
scalaJSUseMainModuleInitializer := true
// Test sources are read from <project root>/test instead of the default location.
scalaSource in Test := baseDirectory.value / "test"

/crawler/project/plugin.sbt

// Registers the Scala.js sbt plugin, version 0.6.22, with the build.
addSbtPlugin("org.scala-js" % "sbt-scalajs" % "0.6.22")

GoogleKeyWordsCrawler.scala

package scalastarter

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths, StandardOpenOption}
import java.util.concurrent.TimeUnit

import org.openqa.selenium.By
import org.openqa.selenium.chrome.{ChromeDriver, ChromeOptions}

import scala.util.Random

/** Crawls Google search results for a list of keywords using Selenium and Chrome.
  *
  * Each input line has the form `no,keywords`. For every keyword a Google search is
  * opened in Chrome and the `href` of each organic result link is appended to the
  * output file as `no;keywords;url`.
  */
object GoogleKeyWordsCrawler {

  import java.net.URLEncoder

  import scala.collection.JavaConverters._
  import scala.util.control.NonFatal

  /** CSV file holding one `no,keywords` entry per line. */
  private val KeywordsFile = "/Users/daniel/1-project/2-ght/crawler/crawler/ref/crawlerKeywords.csv"

  /** File that receives one `no;keywords;url` row per search result. */
  private val OutputFile = "/Users/daniel/1-project/2-ght/crawler/crawler/output/webInfo.csv"

  /** Location of the chromedriver binary used by Selenium. */
  private val ChromeDriverPath = "/Users/daniel/1-project/2-ght/crawler/crawler/ref/chromedriver"

  private val random = new Random

  /** Reads the keyword file and crawls every non-blank line, pausing between searches.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val source = scala.io.Source.fromFile(KeywordsFile)
    try {
      // Blank lines carry no keyword, so they are skipped without paying the delay.
      for (line <- source.getLines() if line.trim.nonEmpty) {
        crawler(line)
        // Randomised pause of roughly 40-60 seconds between consecutive searches.
        Thread.sleep(40000 + random.nextInt(20000) + 1)
      }
    } finally {
      // Release the file handle even when a crawl or the sleep throws.
      source.close()
    }
  }

  /** Crawls the Google results for one `no,keywords` line.
    *
    * A line with fewer than two comma-separated fields is reported on stderr and
    * skipped; no browser is started for it.
    *
    * @param line one line of the keyword CSV file
    */
  def crawler(line: String): Unit = {
    // Parse before starting Chrome so a malformed line cannot leak a browser process.
    line.split(",") match {
      case Array(no, keywords, _*) =>
        crawlKeyword(no, keywords)
      case _ =>
        System.err.println(s"Skipping malformed keyword line: '$line'")
    }
  }

  /** Opens the search page for `keywords` and appends every result URL to the output file. */
  private def crawlKeyword(no: String, keywords: String): Unit = {
    System.setProperty("webdriver.chrome.driver", ChromeDriverPath)
    val options: ChromeOptions = new ChromeOptions()
    // options.addArguments("--headless") // uncomment to run without a visible browser window
    val driver = new ChromeDriver(options)
    try {
      driver.manage.timeouts.implicitlyWait(random.nextInt(3), TimeUnit.SECONDS)
      driver.get(searchUrl(keywords))
      val links = driver.findElements(By.cssSelector("div.rc > div.r > a")).asScala
      for (link <- links) {
        // getAttribute returns null when the attribute is absent; record that as "NA".
        val url = Option(link.getAttribute("href")).getOrElse("NA")
        appendFile(OutputFile, s"$no;$keywords;$url\n")
      }
    } catch {
      // Best effort: log the failure and let the caller continue with the next keyword.
      case NonFatal(e) => e.printStackTrace()
    } finally {
      // Always shut the browser down, on success and on failure alike.
      driver.quit()
    }
  }

  /** Builds the Google search URL for `keywords`.
    *
    * The keyword is URL-encoded so that spaces become `+` and reserved characters
    * such as `&`, `?` or `+` cannot alter the query string.
    */
  private def searchUrl(keywords: String): String = {
    val query = URLEncoder.encode(keywords, StandardCharsets.UTF_8.name())
    // gl = geolocation (tw), num = number of results requested (100).
    s"https://www.google.com/search?q=$query&gl=tw&num=100"
  }

  /** Appends `fileInfo` to `outputFile` as UTF-8, creating the file if it does not exist.
    *
    * @param outputFile path of the file to append to; its parent directory must exist
    * @param fileInfo   text to append
    */
  def appendFile(outputFile: String, fileInfo: String): Unit = {
    Files.write(
      Paths.get(outputFile),
      fileInfo.getBytes(StandardCharsets.UTF_8),
      StandardOpenOption.CREATE,
      StandardOpenOption.APPEND
    )
  }

}

執行結果 :

1;java;https://www.java.com/zh_TW/
1;java;https://zh.wikipedia.org/zh-tw/Java
1;java;https://www.oracle.com/technetwork/java/index.html
1;java;https://translate.google.com/translate?hl=zh-TW&sl=en&u=https://www.oracle.com/technetwork/java/index.html&prev=search
1;java;https://www.oracle.com/technetwork/java/javase/downloads/index.html
1;java;https://translate.google.com/translate?hl=zh-TW&sl=en&u=https://www.oracle.com/technetwork/java/javase/downloads/index.html&prev=search
1;java;https://www.ithome.com.tw/voice/126265
1;java;https://programming.im.ncnu.edu.tw/J_index.html
1;java;http://www.codedata.com.tw/book/java-basic/index.php
...