005 Standard Library
http
- Using http client to send requests
- Using http.Client to control request headers
- Using httputil to simplify work
package main
import (
"fmt"
"net/http"
"net/http/httputil"
)
func main() {
	// Fetch the page; Get follows redirects and uses the default client.
	resp, err := http.Get("https://www.baidu.com")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// DumpResponse(resp, true) renders status line, headers and body
	// as one byte slice, handy for inspecting what came back.
	dump, err := httputil.DumpResponse(resp, true)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", dump)
}
Simulate mobile browsing by setting the User-Agent header
package main
import (
"fmt"
"net/http"
"net/http/httputil"
)
// main fetches baidu.com while pretending to be a mobile browser by
// setting a mobile User-Agent header, then dumps the full response.
func main() {
	request, err := http.NewRequest(http.MethodGet, "https://www.baidu.com", nil)
	if err != nil {
		// BUG FIX: this error was previously ignored; a malformed URL
		// would have caused a nil-pointer panic on request below.
		panic(err)
	}
	request.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36")

	resp, err := http.DefaultClient.Do(request) //http.Get("https://www.baidu.com")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	s, err := httputil.DumpResponse(resp, true)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", s)
}
go tool pprof http://localhost:8888/debug/pprof/profile
Or, by importing the package `_ "net/http/pprof"` to start a web profiling endpoint, enter `http://localhost:8888/debug/pprof` in the browser's address bar.
Other Standard Libraries
- bufio
- log
- encoding/json
- regexp
- time (channel usage)
- strings/math/rand
Refer to the standard library documentation: `godoc -http :8888` starts a local documentation server; see also https://studygolang.com/pkgdoc
Maze Algorithm
Breadth-First Search (BFS) algorithm (widely applicable, highly comprehensive)
Discovered but not yet explored, put in a queue, explore until the end, then walk backward for the shortest path
package main
import (
"fmt"
"os"
)
// readMaze reads a maze description from filename and returns it as a
// 2-D grid. Expected file format: first "rows cols", then rows*cols
// cell values (0 = open, 1 = wall).
func readMaze(filename string) [][]int {
	file, err := os.Open(filename)
	if err != nil {
		panic(err)
	}
	defer file.Close()

	var row, col int
	// BUG FIX: Fscanf errors were previously ignored; a malformed or
	// truncated file would silently produce an all-zero maze.
	if _, err := fmt.Fscanf(file, "%d %d", &row, &col); err != nil {
		panic(err)
	}

	maze := make([][]int, row)
	for i := range maze {
		maze[i] = make([]int, col)
		for j := range maze[i] {
			if _, err := fmt.Fscanf(file, "%d", &maze[i][j]); err != nil {
				panic(err)
			}
		}
	}
	return maze
}
// point is a cell coordinate in the maze: i = row, j = column.
type point struct {
	i, j int
}

// add returns the element-wise sum of p and r.
func (p point) add(r point) point {
	return point{p.i + r.i, p.j + r.j}
}

// at returns the value of grid at p, and whether p lies inside the
// grid's bounds (false means out of range; the value is then 0).
func (p point) at(grid [][]int) (int, bool) {
	if p.i < 0 || p.i >= len(grid) {
		return 0, false
	}
	if p.j < 0 || p.j >= len(grid[p.i]) {
		return 0, false
	}
	return grid[p.i][p.j], true
}

// dirs lists the four neighbor offsets explored by BFS, in the order
// up, left, down, right.
var dirs = [4]point{
	{-1, 0},
	{0, -1},
	{1, 0},
	{0, 1},
}

// walk runs a breadth-first search over maze from start towards end
// and returns a grid of step counts: steps[i][j] is the BFS distance
// from start to cell (i, j), and 0 for unreached cells (and for start
// itself). Cells with maze value 1 are walls.
func walk(maze [][]int, start, end point) [][]int {
	steps := make([][]int, len(maze))
	for i := range steps {
		steps[i] = make([]int, len(maze[i]))
	}

	// Queue of discovered-but-unexplored cells.
	Q := []point{start}
	for len(Q) > 0 {
		cur := Q[0]
		Q = Q[1:]
		// gofmt fix: the original wrapped this condition in redundant
		// parentheses: if (cur == end).
		if cur == end {
			break
		}
		for _, dir := range dirs {
			next := cur.add(dir)
			// Skip walls and out-of-bounds neighbors.
			val, ok := next.at(maze)
			if !ok || val == 1 {
				continue
			}
			// Skip already-visited cells (non-zero step count)...
			val, ok = next.at(steps)
			if !ok || val != 0 {
				continue
			}
			// ...and start itself, whose count is 0 by definition and
			// would otherwise be re-enqueued.
			if next == start {
				continue
			}
			curSteps, _ := cur.at(steps)
			steps[next.i][next.j] = curSteps + 1
			Q = append(Q, next)
		}
	}
	return steps
}
func main() {
	maze := readMaze("maze/maze.in")

	// Show the raw maze, then a blank separator line.
	printGrid(maze, "%3d ")
	fmt.Println()

	// BFS from the top-left corner to the bottom-right corner, then
	// show the per-cell step counts.
	steps := walk(maze, point{0, 0}, point{len(maze) - 1, len(maze[0]) - 1})
	printGrid(steps, "%3d")
}

// printGrid prints a 2-D grid row by row using format for each cell.
func printGrid(grid [][]int, format string) {
	for _, row := range grid {
		for _, val := range row {
			fmt.Printf(format, val)
		}
		fmt.Println()
	}
}
// 途经的点可以倒序遍历一下
Simple Web Crawler
- General-purpose crawlers e.g., Baidu, Google
- Focused crawlers to obtain structured data from the internet
- Go language crawler libraries/frameworks
- henrylee2cn/pholcus
- gocrawl
- colly
- hu17889/go_spider
Technology selection, crawler topics (e.g., news, blogs, communities, we crawl people, QQ, Renren, Weibo, Facebook, dating sites, job search sites)
- ElasticSearch as data storage
- Go language standard template library to implement the http data display part
Single-task Web Crawler
Get and print detailed information of users on the first page of all cities
Transcoding: `go get -u -v golang.org/x/text`
Automatically detect web page encoding: `go get -u -v golang.org/x/net/html`
package main
import (
"bufio"
"fmt"
"io"
"io/ioutil"
"net/http"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
"golang.org/x/text/transform"
)
// main fetches the zhenai city-list page, decodes it to UTF-8 using
// the detected source encoding, and prints the whole page.
func main() {
	//所有城市第一页用户 -> first page of users for all cities
	resp, err := http.Get("http://www.zhenai.com/zhenghun")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Println("Error: ", resp.StatusCode)
		return
	}

	// BUG FIX: determineEncoding used to receive resp.Body directly;
	// the bufio.Reader it created buffered up to 1KB for Peek, and
	// those bytes were then lost to the ReadAll below, truncating the
	// start of the page. Create the buffered reader here and use it
	// for both detection and reading (bufio.NewReader returns an
	// existing *bufio.Reader unchanged, so Peek does not consume).
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	all, err := ioutil.ReadAll(utf8Reader)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", all)
}
// determineEncoding sniffs up to the first 1KB of r to guess the
// character encoding of the content.
func determineEncoding(r io.Reader) encoding.Encoding {
	// BUG FIX: Peek returns an error (e.g. io.EOF) when fewer than
	// 1024 bytes are available but still hands back the bytes it has;
	// the previous panic made every page shorter than 1KB crash the
	// program. A short (or even empty) prefix is fine for
	// DetermineEncoding, which falls back to a default.
	bytes, _ := bufio.NewReader(r).Peek(1024)
	e, _, _ := charset.DetermineEncoding(bytes, "")
	return e
}
Regular Expressions
package main
import (
"fmt"
"regexp"
)
// text is the sample input we extract an email address from.
const text = "My email is ccmouse@gmail.com"

func main() {
	// Raw (backquoted) string literal: regex metacharacters need no
	// extra escaping. The pattern matches user@host.domain emails.
	emailRe := regexp.MustCompile(`[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+`)
	fmt.Println(emailRe.FindString(text))
}
Read JSON file and parse into corresponding objects
package main
import (
"encoding/json"
"fmt"
"os"
)
// CityObj is a single city link entry parsed from cities.json.
type CityObj struct {
	// LinkContent is the link's display text; the tag binds it to the
	// "linkContent" key in the JSON.
	LinkContent string `json:"linkContent"`
	// LinkURL has no tag; encoding/json matches keys
	// case-insensitively, so a "linkUrl"/"linkURL" key would still
	// bind here — TODO confirm the exact key casing in cities.json.
	LinkURL string
}

// CityGroup is one group of cities plus its ordering key.
type CityGroup struct {
	CityList []CityObj
	Order string
}

// RtnData is the top-level shape of cities.json.
type RtnData struct {
	CityData []CityGroup
}
// main loads ./cities.json from the current working directory and
// prints the first city entry of the first city group.
func main() {
	path, _ := os.Getwd()
	fmt.Println(path)

	var jObj RtnData
	file, err := os.Open("./cities.json")
	if err != nil {
		panic(err)
	}
	// FIX: the file handle was previously never closed.
	defer file.Close()

	// Size the buffer from the file's metadata.
	fi, err := file.Stat()
	if err != nil {
		// FIX: the Stat error was previously ignored.
		panic(err)
	}
	buffer := make([]byte, fi.Size())
	_, err = file.Read(buffer)
	if err != nil {
		panic(err)
	}

	// BUG FIX: the Unmarshal error was previously ignored; a bad JSON
	// file surfaced as a confusing index-out-of-range panic below.
	if err = json.Unmarshal(buffer, &jObj); err != nil {
		panic(err)
	}
	fmt.Println(jObj.CityData[0].CityList[0])
}
Crawl data using regular expressions
package main
import (
"bufio"
"fmt"
"io"
"io/ioutil"
"net/http"
"regexp"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
)
// main fetches the zhenai city-list page, decodes it to UTF-8 and
// prints the city links found on it.
func main() {
	//所有城市第一页用户 -> first page of users for all cities
	url := "http://www.zhenai.com/zhenghun"
	resp, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Println("Error: ", resp.StatusCode)
		panic("Status Code:")
	}

	// BUG FIX: determineEncoding used to receive resp.Body directly;
	// the bufio.Reader it created buffered up to 1KB for Peek, and
	// those bytes were then lost to the ReadAll below, truncating the
	// start of the page. Share one buffered reader for detection and
	// reading (bufio.NewReader returns an existing *bufio.Reader
	// unchanged, so Peek does not consume).
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	all, err := ioutil.ReadAll(utf8Reader)
	if err != nil {
		fmt.Printf("%v\n", err)
		return
	}
	printCityList(all)
}
// determineEncoding sniffs up to the first 1KB of r to guess the
// character encoding of the content, defaulting to UTF-8 when
// nothing can be read at all.
func determineEncoding(r io.Reader) encoding.Encoding {
	bytes, err := bufio.NewReader(r).Peek(1024)
	// BUG FIX: Peek reports an error (e.g. io.EOF) when fewer than
	// 1024 bytes are available but still returns the bytes it has.
	// The old code skipped detection for every page shorter than 1KB
	// and blindly assumed UTF-8; only fall back when zero bytes were
	// actually read.
	if err != nil && len(bytes) == 0 {
		return unicode.UTF8
	}
	e, _, _ := charset.DetermineEncoding(bytes, "")
	return e
}
// printCityList extracts the city links
// (<a href="http://www.zhenai.com/zhenghun/CITY">) from the page
// contents, prints each match, then prints how many were found.
func printCityList(contents []byte) {
	cityRe := regexp.MustCompile(`<a href="http://www.zhenai.com/zhenghun/[0-9a-z]+"[^>]*>[^<]+</a>`) // 470 cities
	matches := cityRe.FindAll(contents, -1)
	for _, match := range matches {
		fmt.Printf("%s\n", match)
	}
	fmt.Printf("Matches found: %d\n", len(matches))
}
When crawling data, pay attention to the request header settings:
- Declare a client: `client := http.Client{}`
- Declare the request: `request, err := http.NewRequest("GET", url, nil)` and set headers with `request.Header.Add`
- Replace `http.Get(url)` with `resp, err := client.Do(request)` followed by `defer resp.Body.Close()`
Up to this point for the single-task version, check the content of the directories
engine, fetcher, model, zhenai
Concurrent Version
The output of the Fetcher is the input of the Parser, which can be extracted into a module
Scheduler Implementation 1: All Workers share one input
See the branch `simplechen`
Implementation 2 There are two queues: request queue and worker queue
Put requests into the request queue, create a goroutine for each request, then let all workers contend for a channel.
Other Pages
High repetition rate, crawl a city, for example, Shanghai
e.Run(engine.Request{
Url:"http://www.zhenai.com/zhenghun/shanghai",
ParserFunc: parser.ParseCity,
})
URL Deduplication
- Hash table (direct storage, takes up space - used in this course)
- Calculate MD5 or other hashes, then store in a hash table
- Use Bloom filter multi-hash structure
- Use Redis or other key-value storage systems to achieve distributed deduplication
Profile Saving
Abstract the concept of Task: FetchTask, PersistTask share one Engine, Scheduler, requiring FetchWorker, PersistWorker to be created; this seems too heavy for this project. Create a goroutine for each Item and submit it to ItemSaver; ItemSaver is faster than Fetcher, a method similar to SimpleScheduler will suffice, and we will adopt this method.
Elastic and Docker
Elastic full-text search engine
This tutorial uses Docker to integrate Elastic install doc
docker pull docker.elastic.co/elasticsearch/elasticsearch:6.7.2
docker run -e ES_JAVA_OPTS="-Xms256m -Xmx256m" -d -p 9200:9200 -p 9300:9300 -v /home/soft/ES/config/es1.yml:/usr/share/elasticsearch/config/elasticsearch.yml -v /home/soft/ES/data1:/usr/share/elasticsearch/data --name ES01 elasticsearch:5.6.8
docker run -e ES_JAVA_OPTS="-Xms256m -Xmx256m" -d -p 9201:9201 -p 9301:9301 -v /home/soft/ES/config/es2.yml:/usr/share/elasticsearch/config/elasticsearch.yml -v /home/soft/ES/data2:/usr/share/elasticsearch/data --name ES02 elasticsearch:5.6.8
docker run -e ES_JAVA_OPTS="-Xms256m -Xmx256m" -d -p 9202:9202 -p 9302:9302 -v /home/soft/ES/config/es3.yml:/usr/share/elasticsearch/config/elasticsearch.yml -v /home/soft/ES/data3:/usr/share/elasticsearch/data --name ES03 elasticsearch:5.6.8
# 本例运行如下
docker run -e ES_JAVA_OPTS="-Xms256m -Xmx256m" -d -p 9200:9200 -p 9300:9300 -v /Users/willzhao/Documents/Develop/docker/elasticsearch/6.8/data:/usr/share/elasticsearch/data --name ES_6.8 elasticsearch:6.8.0
# http://localhost:9200/ 可以访问
For example, we create a 'course' database
To add records, we can use PUT or POST to `course/1`, `course/2` to add JSON data; we can use GET to retrieve all data, or use GET with parameters: `course/_search?...`
If no ID is specified, POST must be used
`<server>:9200/<index>/<type>/<id>` — index is equivalent to a database
- type is equivalent to a table
- We don't need to pre-create index and type (IK plugin, learn about it if you have time)
- Use REST interface
- PUT/POST to create/modify data; using POST allows omitting the ID
- GET to retrieve data
- or
GET <index>/<type>/_search?q='参数'
The save() in this lesson uses the elasticsearch client
Go community version
https://gopkg.in/olivere/elastic.v6
# 先安装v6(我认为是基础
go get -v -u github.com/olivere/elastic
# 引这个的包 本例中用的是v6 在查询全
# 这个是v6的client https://olivere.github.io/elastic/ 请仔细阅读这个文档
go get gopkg.in/olivere/elastic.v6
html/template
Distributed Systems
- Multiple nodes
- Fault tolerance
- Scalability (performance)
- Inherent distribution
- Message passing
- Nodes have private storage
- Easy to develop
- Scalability (functionality)
- Comparison: Parallel computing
- Message passing methods
- REST
- RPC
- Middleware (messageQ)
- Use cases
- External: Rest
- Internal to modules: RPC
- Between modules: Middleware, REST
- Fulfill specific requirements
Distributed Architecture vs. Microservice Architecture
Distributed: Guides how nodes communicate with each other
Microservices: Encourages dividing modules by business domain
Multi-tier Architecture vs. Microservice Architecture
Microservice architecture has more "services"
Microservices usually require automated testing, deployment, and service discovery
Distributed Web Crawler
- Rate limiting issues
- Limited traffic capacity for a single node
- Place workers on different nodes
- Deduplication issues
- Distributed deduplication places deduplication on workers
- Data storage issues
jsonRPC
package rpcdemo
import "errors"
// DemoService exposes methods over net/rpc; the wire method name is
// "DemoService.Dive" (Service.Method).
type DemoService struct{}

// Args carries the two operands for Dive: the quotient is A / B.
type Args struct {
	A, B int
}

// Dive divides args.A by args.B and stores the quotient in *result.
// It returns an error (leaving *result untouched) when args.B is 0.
func (DemoService) Dive(args Args, result *float64) error {
	if args.B == 0 {
		// staticcheck ST1005: error strings should not end with
		// punctuation (was "division by zero.").
		return errors.New("division by zero")
	}
	*result = float64(args.A) / float64(args.B)
	return nil
}
// server
package main
import (
rpcdemo "gobasic/rpc"
"log"
"net"
"net/rpc"
"net/rpc/jsonrpc"
)
// main registers DemoService with net/rpc and serves JSON-RPC over
// TCP on port 1234, handling each connection in its own goroutine.
func main() {
	// BUG FIX: the error from Register (e.g. a service with no
	// suitable methods) was previously ignored.
	if err := rpc.Register(rpcdemo.DemoService{}); err != nil {
		panic(err)
	}
	listener, err := net.Listen("tcp", ":1234")
	if err != nil {
		panic(err)
	}
	for {
		conn, err := listener.Accept()
		if err != nil {
			log.Printf("accept error: %v", err)
			continue
		}
		// Serve each client concurrently using the JSON codec.
		go jsonrpc.ServeConn(conn)
	}
}
You can test it with telnet localhost 1234, send JSON data as follows
{"method":"DemoService.Dive","params":[{"A":3,"B":4}],"id":1}
// 返回结果
{"id":1,"result":0.75,"error":null}
Create a client to call
// client/main.go
package main
import (
"fmt"
rpcdemo "gobasic/rpc"
"net"
"net/rpc/jsonrpc"
)
// main dials the JSON-RPC server on :1234 and performs two Dive
// calls: one valid division and one division by zero.
func main() {
	conn, err := net.Dial("tcp", ":1234")
	if err != nil {
		panic(err)
	}
	client := jsonrpc.NewClient(conn)
	// FIX: close the client (which closes the underlying connection)
	// on exit; it was previously leaked.
	defer client.Close()

	var result float64
	err = client.Call("DemoService.Dive", rpcdemo.Args{10, 3}, &result)
	fmt.Println(result, err)

	err = client.Call("DemoService.Dive", rpcdemo.Args{10, 0}, &result)
	fmt.Println(result, err)
}
主题测试文章,只做测试使用。发布者:Walker,转转请注明出处:https://walker-learn.xyz/archives/6742
