Inverted Index for Queries
1. What is an Inverted Index?
An Inverted Index is a data structure used to quickly find documents containing specific terms. It is one of the core technologies of search engines.
1.1 Basic Concepts
- Forward Index: Document ID → Document Content (Term List)
- Inverted Index: Term → List of Document IDs containing that term
1.2 Why is it called "Inverted"?
An inverted index reverses the traditional relationship of "which terms a document contains" to "in which documents a term appears," hence the name "inverted."
2. Structure of an Inverted Index
2.1 Basic Structure
Term → Document Frequency → Document List
2.2 Detailed Structure
Term → {
DocFreq: N,
Postings: [
{DocID: 1, Frequency: 2, Positions: [0, 5]},
{DocID: 3, Frequency: 1, Positions: [2]}
]
}
3. How an Inverted Index Works
3.1 Building Process
- Document Preprocessing: Tokenization, stop word removal, stemming
- Term Statistics: Count the frequency and position of each term in documents
- Index Construction: Establish the mapping relationship from terms to documents
3.2 Query Process
- Query Parsing: Tokenize the query string
- Index Lookup: Look up each term in the inverted index
- Result Merging: Merge document lists for multiple terms
- Sorted Return: Sort by relevance and return results
4. Implementing an Inverted Index in Go
4.1 Data Structure Definition
package main
import (
"fmt"
"sort"
"strings"
)
// 文档信息
type Document struct {
ID int
Text string
}
// 词项在文档中的位置信息
type Posting struct {
DocID int
Frequency int
Positions []int
}
// 倒排索引项
type InvertedIndexItem struct {
Term string
DocFreq int
Postings []Posting
}
// 倒排索引
type InvertedIndex struct {
Index map[string]*InvertedIndexItem
}
// 创建新的倒排索引
func NewInvertedIndex() *InvertedIndex {
return &InvertedIndex{
Index: make(map[string]*InvertedIndexItem),
}
}
4.2 Index Construction
// 添加文档到索引
func (idx *InvertedIndex) AddDocument(docID int, text string) {
// 简单的分词(实际应用中需要更复杂的分词算法)
words := strings.Fields(strings.ToLower(text))
for pos, word := range words {
if idx.Index[word] == nil {
idx.Index[word] = &InvertedIndexItem{
Term: word,
DocFreq: 0,
Postings: make([]Posting, 0),
}
}
// 查找是否已存在该文档的posting
var posting *Posting
for i := range idx.Index[word].Postings {
if idx.Index[word].Postings[i].DocID == docID {
posting = &idx.Index[word].Postings[i]
break
}
}
if posting == nil {
// 创建新的posting
newPosting := Posting{
DocID: docID,
Frequency: 1,
Positions: []int{pos},
}
idx.Index[word].Postings = append(idx.Index[word].Postings, newPosting)
idx.Index[word].DocFreq++
} else {
// 更新现有posting
posting.Frequency++
posting.Positions = append(posting.Positions, pos)
}
}
}
4.3 Query Implementation
// 单词查询
func (idx *InvertedIndex) Search(term string) []int {
term = strings.ToLower(term)
if item, exists := idx.Index[term]; exists {
docIDs := make([]int, len(item.Postings))
for i, posting := range item.Postings {
docIDs[i] = posting.DocID
}
return docIDs
}
return []int{}
}
// 多词查询(AND操作)
func (idx *InvertedIndex) SearchAnd(terms []string) []int {
if len(terms) == 0 {
return []int{}
}
// 获取第一个词的结果
result := idx.Search(terms[0])
// 与其他词的结果求交集
for i := 1; i < len(terms); i++ {
otherResult := idx.Search(terms[i])
result = intersect(result, otherResult)
}
return result
}
// 多词查询(OR操作)
func (idx *InvertedIndex) SearchOr(terms []string) []int {
if len(terms) == 0 {
return []int{}
}
resultSet := make(map[int]bool)
for _, term := range terms {
docIDs := idx.Search(term)
for _, docID := range docIDs {
resultSet[docID] = true
}
}
result := make([]int, 0, len(resultSet))
for docID := range resultSet {
result = append(result, docID)
}
sort.Ints(result)
return result
}
// 求两个切片的交集
func intersect(a, b []int) []int {
set := make(map[int]bool)
for _, x := range a {
set[x] = true
}
result := make([]int, 0)
for _, x := range b {
if set[x] {
result = append(result, x)
}
}
return result
}
4.4 Complete Example
func main() {
// 创建倒排索引
index := NewInvertedIndex()
// 添加文档
documents := []Document{
{ID: 1, Text: "Go is a programming language"},
{ID: 2, Text: "Go is fast and efficient"},
{ID: 3, Text: "Programming in Go is fun"},
{ID: 4, Text: "Go language is simple"},
}
// 构建索引
for _, doc := range documents {
index.AddDocument(doc.ID, doc.Text)
}
// 查询示例
fmt.Println("搜索 'go':", index.Search("go"))
fmt.Println("搜索 'programming':", index.Search("programming"))
fmt.Println("搜索 'go' AND 'language':", index.SearchAnd([]string{"go", "language"}))
fmt.Println("搜索 'go' OR 'fast':", index.SearchOr([]string{"go", "fast"}))
// 打印索引结构
fmt.Println("\n倒排索引结构:")
for term, item := range index.Index {
fmt.Printf("词项: %s, 文档频率: %d\n", term, item.DocFreq)
for _, posting := range item.Postings {
fmt.Printf(" 文档ID: %d, 词频: %d, 位置: %v\n",
主题测试文章,只做测试使用。发布者:Walker,转转请注明出处:https://walker-learn.xyz/archives/4784