Go Senior Engineer Lecture (MOOC) 005

005 Standard Library

http

  • Using http client to send requests
  • Using http.Client to control request headers
  • Using httputil to simplify work
package main

import (
    "fmt"
    "net/http"
    "net/http/httputil"
)

// Demo: fetch a page with the default HTTP client and dump the whole
// response (status line, headers, and body) via httputil.DumpResponse.
func main() {
    resp, err := http.Get("https://www.baidu.com")
    if err != nil {
        panic(err)
    }
    // Always close the body so the underlying connection can be reused.
    defer resp.Body.Close()

    // The second argument asks DumpResponse to include the body.
    s, err := httputil.DumpResponse(resp, true)
    if err != nil {
        panic(err)
    }
    fmt.Printf("%s\n", s)
}

Simulate mobile browsing by setting the User-Agent request header

package main

import (
    "fmt"
    "net/http"
    "net/http/httputil"
)

// Demo: fetch a page pretending to be a mobile browser by overriding the
// User-Agent header on a hand-built request, then dump the full response.
func main() {
    request, err := http.NewRequest(http.MethodGet, "https://www.baidu.com", nil)
    if err != nil {
        // Was unchecked: a bad method/URL leaves request nil and the
        // Header.Add call below would panic with a nil dereference.
        panic(err)
    }
    request.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36")

    resp, err := http.DefaultClient.Do(request)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    s, err := httputil.DumpResponse(resp, true)
    if err != nil {
        panic(err)
    }
    fmt.Printf("%s\n", s)
}

go tool pprof http://localhost:8888/debug/pprof/profile
or start a web service by importing the package for its side effects: import _ "net/http/pprof"; then enter http://localhost:8888/debug/pprof in the browser address bar

Other Standard Libraries

  • bufio
  • log
  • encoding/json
  • regexp
  • time (channel usage)
  • strings/math/rand

Refer to the standard library documentation
godoc -http :8888 start a local service
https://studygolang.com/pkgdoc

Maze Algorithm

Breadth-First Search (BFS) algorithm (widely applicable, highly comprehensive)
Discovered but not yet explored, put in a queue, explore until the end, then walk backward for the shortest path

breadth

package main

import (
      "fmt"
      "os"
)

// readMaze loads a maze from filename and returns it as a 2-D grid.
// The file starts with "rows cols" followed by rows*cols cell values
// (0 = open, 1 = wall).
//
// NOTE(review): the fmt.Fscanf return values are deliberately left
// unchecked, matching the original; a malformed file yields zero-valued
// cells rather than an error — confirm that is acceptable.
func readMaze(filename string) [][]int {
    f, err := os.Open(filename)
    if err != nil {
        panic(err)
    }
    defer f.Close()

    var rows, cols int
    fmt.Fscanf(f, "%d %d", &rows, &cols)

    grid := make([][]int, rows)
    for r := range grid {
        grid[r] = make([]int, cols)
        for c := range grid[r] {
            fmt.Fscanf(f, "%d", &grid[r][c])
        }
    }
    return grid
}

// point is a cell coordinate in the maze grid: i is the row, j the column.
type point struct {
    i, j int
}

// add returns the coordinate shifted by r (component-wise vector addition).
func (p point) add(r point) point {
    return point{i: p.i + r.i, j: p.j + r.j}
}

// at returns the grid value at p together with a bool reporting whether
// p lies inside the grid; the bool is false when p is out of bounds.
func (p point) at(grid [][]int) (int, bool) {
    if p.i < 0 || p.i >= len(grid) {
        return 0, false
    }
    row := grid[p.i]
    if p.j < 0 || p.j >= len(row) {
        return 0, false
    }
    return row[p.j], true
}

// dirs enumerates the four orthogonal moves, tried in the order:
// up, left, down, right.
var dirs = [4]point{
    {i: -1, j: 0}, // up
    {i: 0, j: -1}, // left
    {i: 1, j: 0},  // down
    {i: 0, j: 1},  // right
}

// walk runs a breadth-first search over maze from start toward end and
// returns a grid of step counts: steps[i][j] is the BFS distance from
// start to cell (i, j), and 0 means "never visited" (except at start
// itself, whose distance is legitimately 0). Cells whose maze value is
// 1 are walls and are never entered.
func walk(maze [][]int, start, end point) [][]int {
    steps := make([][]int, len(maze))
    for i := range steps {
        steps[i] = make([]int, len(maze[i]))
    }

    // FIFO queue of discovered-but-unexplored cells.
    queue := []point{start}
    for len(queue) > 0 {
        cur := queue[0]
        queue = queue[1:]
        if cur == end { // target reached; its distance is already final
            break
        }
        for _, dir := range dirs {
            next := cur.add(dir)
            // Skip out-of-bounds cells and walls.
            val, ok := next.at(maze)
            if !ok || val == 1 {
                continue
            }
            // Skip cells already visited (non-zero step count) ...
            val, ok = next.at(steps)
            if !ok || val != 0 {
                continue
            }
            // ... and the start cell, whose step count stays 0 by design.
            if next == start {
                continue
            }
            curSteps, _ := cur.at(steps)
            steps[next.i][next.j] = curSteps + 1
            queue = append(queue, next)
        }
    }
    return steps
}

// Entry point: read the maze file, echo the grid, then print the BFS
// step counts from the top-left corner to the bottom-right corner.
// (The visited cells could be traversed backwards to recover the path.)
func main() {
    maze := readMaze("maze/maze.in")
    for _, row := range maze {
        for _, cell := range row {
            fmt.Printf("%3d ", cell)
        }
        fmt.Println()
    }
    fmt.Println()

    start := point{0, 0}
    end := point{len(maze) - 1, len(maze[0]) - 1}
    steps := walk(maze, start, end)
    for _, row := range steps {
        for _, cell := range row {
            fmt.Printf("%3d", cell)
        }
        fmt.Println()
    }
}
// 途经的点可以倒序遍历一下

Simple Web Crawler

  • General-purpose crawlers e.g., Baidu, Google
  • Focused crawlers to obtain structured data from the internet
  • Go language crawler libraries/frameworks
  • henrylee2cn/pholcus
  • gocrawl
  • colly
  • hu17889/go_spider

Technology selection, crawler topics (e.g., news, blogs, communities, we crawl people, QQ, Renren, Weibo, Facebook, dating sites, job search sites)

  • ElasticSearch as data storage
  • Go language standard template library to implement the http data display part

Single-task Web Crawler

Get and print detailed information of users on the first page of all cities
Transcoding: go get -u -v golang.org/x/text
Automatically detect web page encoding: go get -u -v golang.org/x/net/html

package main

import (
    "bufio"
    "bytes"
    "fmt"
    "io"
    "io/ioutil"
    "net/http"

    "golang.org/x/net/html/charset"
    "golang.org/x/text/encoding"
    "golang.org/x/text/encoding/unicode"
    "golang.org/x/text/transform"
)

// Demo: fetch the city-list page (first page of users for all cities),
// detect its encoding, and print the body converted to UTF-8.
func main() {
    resp, err := http.Get("http://www.zhenai.com/zhenghun")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        fmt.Println("Error: ", resp.StatusCode)
        return
    }

    // Read the raw body once. The previous version let determineEncoding
    // peek at resp.Body through its own bufio.Reader and then wrapped
    // resp.Body again for decoding, which silently dropped the peeked
    // (up to 1024) bytes from the decoded output.
    raw, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }
    e := determineEncoding(bytes.NewReader(raw))
    utf8Reader := transform.NewReader(bytes.NewReader(raw), e.NewDecoder())
    all, err := ioutil.ReadAll(utf8Reader)
    if err != nil {
        panic(err)
    }
    fmt.Printf("%s\n", all)
}

// determineEncoding sniffs the character encoding of the HTML stream r
// by peeking at up to its first 1024 bytes.
//
// It falls back to UTF-8 when nothing at all can be read. The original
// version panicked on any Peek error, which crashed on every page
// shorter than 1024 bytes (bufio.Reader.Peek returns io.EOF together
// with the partial data in that case); the partial bytes are still fed
// to the detector here.
func determineEncoding(r io.Reader) encoding.Encoding {
    peek, err := bufio.NewReader(r).Peek(1024)
    if err != nil && len(peek) == 0 {
        return unicode.UTF8
    }
    e, _, _ := charset.DetermineEncoding(peek, "")
    return e
}

Regular Expressions

package main

import (
    "fmt"
    "regexp"
)

const text = "My email is ccmouse@gmail.com"

func main() {
    //re := regexp.MustCompile("ccmouse@gmail.com")
    //用反引号,正则的特殊符号不用再转意了
    re := regexp.MustCompile(`[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+`)
    match := re.FindString(text)
    fmt.Println(match)
}

Read JSON file and parse into corresponding objects

package main

import (
    "encoding/json"
    "fmt"
    "os"

)

// CityObj is one city entry parsed from cities.json.
type CityObj struct {
    LinkContent string `json:"linkContent"`
    // LinkURL has no json tag, so it relies on encoding/json's
    // case-insensitive field matching — presumably the JSON key is
    // "linkURL" or similar; TODO confirm against cities.json.
    LinkURL string
}

// CityGroup is a group of cities sharing one ordering key.
type CityGroup struct {
    CityList []CityObj
    Order string
}

// RtnData is the top-level shape of cities.json.
type RtnData struct {
    CityData []CityGroup
}

// Read cities.json from the working directory, decode it into RtnData,
// and print the first city of the first group as a smoke test.
func main() {
    path, _ := os.Getwd()
    fmt.Println(path)

    var jObj RtnData

    // os.ReadFile replaces the old Open/Stat/single-Read sequence: one
    // file.Read call does not guarantee filling the buffer, so large
    // files could previously be only partially parsed.
    buffer, err := os.ReadFile("./cities.json")
    if err != nil {
        panic(err)
    }

    // The Unmarshal error was previously ignored, hiding malformed JSON.
    if err := json.Unmarshal(buffer, &jObj); err != nil {
        panic(err)
    }

    // Guard the index accesses so empty data reports cleanly instead of
    // panicking with "index out of range".
    if len(jObj.CityData) == 0 || len(jObj.CityData[0].CityList) == 0 {
        fmt.Println("no city data found")
        return
    }
    fmt.Println(jObj.CityData[0].CityList[0])
}

Crawl data using regular expressions

package main

import (
    "bufio"
    "fmt"
    "io"
    "io/ioutil"
    "net/http"
    "regexp"

    "golang.org/x/net/html/charset"
    "golang.org/x/text/encoding"
    "golang.org/x/text/encoding/unicode"
    "golang.org/x/text/transform"
)

// Fetch the city index page (first page of users for all cities),
// convert it to UTF-8, and print the city links found in it.
func main() {
    // First page of users for all cities.
    url := "http://www.zhenai.com/zhenghun"
    resp, err := http.Get(url)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        fmt.Println("Error: ", resp.StatusCode)
        panic("Status Code:")

    }
    // Auto-detect the page encoding, then decode the body to UTF-8.
    //
    // NOTE(review): determineEncoding peeks at resp.Body through its own
    // bufio.Reader, so the bytes it buffers (up to 1024) are consumed
    // from resp.Body and never reach utf8Reader below — the start of the
    // page is silently dropped. Consider reading the body once and
    // detecting the encoding on those bytes instead.
    e := determineEncoding(resp.Body)
    utf8Reader := transform.NewReader(resp.Body, e.NewDecoder())
    all, err := ioutil.ReadAll(utf8Reader)
    if err != nil {
        //panic(err)
        fmt.Printf("%v\n", err)
        return
    }

    //fmt.Printf("%s\n", all)
    printCityList(all)

}

// determineEncoding sniffs the character encoding of the HTML stream r
// by peeking at up to its first 1024 bytes, falling back to UTF-8 only
// when nothing at all can be read.
//
// Pages shorter than 1024 bytes make bufio.Reader.Peek return io.EOF
// alongside the partial data; the previous version discarded that data
// and assumed UTF-8, whereas the partial peek is now still fed to the
// charset detector.
func determineEncoding(r io.Reader) encoding.Encoding {
    peek, err := bufio.NewReader(r).Peek(1024)
    if err != nil && len(peek) == 0 {
        return unicode.UTF8
    }
    e, _, _ := charset.DetermineEncoding(peek, "")
    return e
}

// printCityList prints every city link of the form
// <a href="http://www.zhenai.com/zhenghun/<city>" ...>name</a> found in
// contents, followed by a summary line, and returns the number of
// matches. The return value is new and backward-compatible: existing
// callers that ignore it still compile.
func printCityList(contents []byte) int {
    // Roughly 470 city links match on the live page.
    re := regexp.MustCompile(`<a href="http://www.zhenai.com/zhenghun/[0-9a-z]+"[^>]*>[^<]+</a>`)
    matches := re.FindAll(contents, -1)
    for _, m := range matches {
        fmt.Printf("%s\n", m)
    }
    fmt.Printf("Matches found: %d\n", len(matches))
    return len(matches)
}

single_task

When crawling data, pay attention to the request header settings

  1. Declare http.Client{}
  2. Declare request,err:=http.NewRequest("GET",url,nil)
  3. request.Header.Add
  4. Replace http.Get(url) with resp, err := client.Do(request)
  5. defer resp.Body.Close()

Up to this point for the single-task version; check the content of the directories engine, fetcher, model, and zhenai

Concurrent Version

The output of the Fetcher is the input of the Parser, which can be extracted into a module

concurrent_worker.jpg

Scheduler Implementation 1: All Workers share one input

See branchsimplechen

Implementation 2 There are two queues: request queue and worker queue

concurrent_worker_v2

Put requests into the request queue, create a goroutine for each request, then let all workers contend for a channel.

Other Pages

High repetition rate, crawl a city, for example, Shanghai

e.Run(engine.Request{
    Url:"http://www.zhenai.com/zhenghun/shanghai",
    ParserFunc: parser.ParseCity,
})

URL Deduplication

  • Hash table (direct storage, takes up space - used in this course)
  • Calculate MD5 or other hashes, then store in a hash table
  • Use Bloom filter multi-hash structure
  • Use Redis or other key-value storage systems to achieve distributed deduplication

Profile Saving

Abstract the concept of Task: FetchTask, PersistTask share one Engine, Scheduler, requiring FetchWorker, PersistWorker to be created; this seems too heavy for this project. Create a goroutine for each Item and submit it to ItemSaver; ItemSaver is faster than Fetcher, a method similar to SimpleScheduler will suffice, and we will adopt this method.

saver

Elastic and Docker

Elastic full-text search engine
This tutorial uses Docker to integrate Elastic install doc

docker pull docker.elastic.co/elasticsearch/elasticsearch:6.7.2

 docker run -e ES_JAVA_OPTS="-Xms256m -Xmx256m" -d -p 9200:9200 -p 9300:9300 -v /home/soft/ES/config/es1.yml:/usr/share/elasticsearch/config/elasticsearch.yml -v /home/soft/ES/data1:/usr/share/elasticsearch/data --name ES01 elasticsearch:5.6.8

 docker run -e ES_JAVA_OPTS="-Xms256m -Xmx256m" -d -p 9201:9201 -p 9301:9301 -v /home/soft/ES/config/es2.yml:/usr/share/elasticsearch/config/elasticsearch.yml -v /home/soft/ES/data2:/usr/share/elasticsearch/data --name ES02 elasticsearch:5.6.8

 docker run -e ES_JAVA_OPTS="-Xms256m -Xmx256m" -d -p 9202:9202 -p 9302:9302 -v /home/soft/ES/config/es3.yml:/usr/share/elasticsearch/config/elasticsearch.yml -v /home/soft/ES/data3:/usr/share/elasticsearch/data --name ES03 elasticsearch:5.6.8

#   本例运行如下
docker run -e ES_JAVA_OPTS="-Xms256m -Xmx256m" -d -p 9200:9200 -p 9300:9300 -v /Users/willzhao/Documents/Develop/docker/elasticsearch/6.8/data:/usr/share/elasticsearch/data --name ES_6.8 elasticsearch:6.8.0
# http://localhost:9200/ 可以访问

For example, we create a 'course' database
To add records, we can use PUT or POST to course/1, course/2, etc. with JSON data; we can use GET to retrieve all data, or use GET with query parameters: course/_search?q=...
If no ID is specified, POST must be used

  • <server>:9200/index/type/id
  • index is equivalent to a database
  • type is equivalent to a table
  • We don't need to pre-create index and type (IK plugin, learn about it if you have time)
  • Use REST interface
  • PUT/POST to create/modify data; using POST allows omitting the ID
  • GET to retrieve data
  • or GET <index>/<type>/_search?q='参数'

The save() in this lesson uses the elasticsearch client
Go community version
https://gopkg.in/olivere/elastic.v6

elastic_client.jpg

# 先安装v6(我认为是基础
go get -v -u github.com/olivere/elastic
# 引这个的包 本例中用的是v6 在查询全
# 这个是v6的client https://olivere.github.io/elastic/ 请仔细阅读这个文档
go get gopkg.in/olivere/elastic.v6

html/template

Distributed Systems

  • Multiple nodes
  • Fault tolerance
  • Scalability (performance)
  • Inherent distribution
  • Message passing
  • Nodes have private storage
  • Easy to develop
  • Scalability (functionality)
  • Comparison: Parallel computing
  • Message passing methods
    • REST
    • RPC
    • Middleware (messageQ)
  • Use cases
    • External: Rest
    • Internal to modules: RPC
    • Between modules: Middleware, REST
  • Fulfill specific requirements

Distributed Architecture vs. Microservice Architecture

Distributed: Guides how nodes communicate with each other
Microservices: Encourages dividing modules by business domain

Multi-tier Architecture vs. Microservice Architecture

Microservice architecture has more "services"
Microservices usually require automated testing, deployment, and service discovery

Distributed Web Crawler

  • Rate limiting issues
  • Limited traffic capacity for a single node
  • Place workers on different nodes
  • Deduplication issues
  • Distributed deduplication places deduplication on workers
  • Data storage issues

jsonRPC

package rpcdemo

import "errors"

// Service.Method

// Args carries the two operands of a division request: A / B.
type Args struct {
    A, B int
}

// DemoService is the RPC service; its exported methods follow the
// net/rpc convention: Method(args T, reply *U) error.
type DemoService struct{}

// Dive divides args.A by args.B and stores the quotient in result,
// returning an error instead of dividing by zero.
// (The name "Dive" is part of the wire protocol — clients call
// "DemoService.Dive" — so it is kept even though "Divide" was meant.)
func (DemoService) Dive(args Args, result *float64) error {
    if args.B == 0 {
        // Error strings are lowercase without trailing punctuation per
        // Go convention (was "division by zero.").
        return errors.New("division by zero")
    }
    *result = float64(args.A) / float64(args.B)
    return nil
}
// server
package main

import (
    rpcdemo "gobasic/rpc"
    "log"
    "net"
    "net/rpc"
    "net/rpc/jsonrpc"
)

// Start a JSON-RPC server on :1234 exposing DemoService, serving each
// connection on its own goroutine.
func main() {
    // Register returns an error (e.g. when the type has no suitable
    // exported methods) that was previously discarded.
    if err := rpc.Register(rpcdemo.DemoService{}); err != nil {
        panic(err)
    }

    listener, err := net.Listen("tcp", ":1234")
    if err != nil {
        panic(err)
    }
    for {
        conn, err := listener.Accept()
        if err != nil {
            log.Printf("accept error: %v", err)
            continue
        }
        // jsonrpc speaks JSON-RPC 1.0 over the raw connection.
        go jsonrpc.ServeConn(conn)
    }
}

You can test it with telnet localhost 1234, send JSON data as follows

{"method":"DemoService.Dive","params":[{"A":3,"B":4}],"id":1}
// 返回结果
{"id":1,"result":0.75,"error":null}

Create a client to call

// client/main.go
package main

import (
    "fmt"
    rpcdemo "gobasic/rpc"
    "net"
    "net/rpc/jsonrpc"
)

// Connect to the JSON-RPC server on :1234 and call DemoService.Dive
// twice: once with a valid divisor, once with zero to show the error.
func main() {
    conn, err := net.Dial("tcp", ":1234")
    if err != nil {
        panic(err)
    }
    client := jsonrpc.NewClient(conn)

    // result is shared across calls on purpose: on an error reply the
    // server writes nothing, so the previous value is printed alongside
    // the error — same as the original two explicit calls.
    var result float64
    for _, args := range []rpcdemo.Args{{10, 3}, {10, 0}} {
        err := client.Call("DemoService.Dive", args, &result)
        fmt.Println(result, err)
    }
}

主题测试文章,只做测试使用。发布者:Walker,转转请注明出处:https://walker-learn.xyz/archives/6742

(0)
Walker的头像Walker
上一篇 12 hours ago
下一篇 Nov 25, 2025 12:00

Related Posts

EN
简体中文 繁體中文 English