Go语言自然语言处理：文本处理与分析

张

张建站

2026/5/30 1:15:24

10分钟阅读

# Go语言自然语言处理：文本处理与分析

## 引言

自然语言处理（NLP）是人工智能的重要分支，它使计算机能够理解、处理和生成人类语言。Go语言以其高性能和并发能力，成为构建NLP应用的理想选择。本文将介绍如何使用Go语言进行自然语言处理。

## 一、文本处理基础

### 1.1 字符串操作

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
)

func main() {
	text := "Hello, World! 你好世界"

	// 字符串长度
	fmt.Printf("长度: %d\n", len(text))

	// 转换为小写
	fmt.Printf("小写: %s\n", strings.ToLower(text))

	// 转换为大写
	fmt.Printf("大写: %s\n", strings.ToUpper(text))

	// 拆分字符串
	words := strings.Fields(text)
	fmt.Printf("分词: %v\n", words)

	// 替换
	replaced := strings.ReplaceAll(text, "World", "Go")
	fmt.Printf("替换后: %s\n", replaced)
}
```

### 1.2 Unicode处理

```go
package main

import (
	"fmt"
	"unicode"
)

func main() {
	text := "Hello 世界 123 !"

	// 遍历字符
	for _, r := range text {
		fmt.Printf("%c - 类型: ", r)
		switch {
		case unicode.IsLetter(r):
			fmt.Println("字母")
		case unicode.IsDigit(r):
			fmt.Println("数字")
		case unicode.IsSpace(r):
			fmt.Println("空格")
		default:
			fmt.Println("其他")
		}
	}
}
```

## 二、分词处理

### 2.1 英文分词

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

func tokenize(text string) []string {
	// 移除标点符号
	re := regexp.MustCompile(`[^\w\s]`)
	cleaned := re.ReplaceAllString(text, "")

	// 转换为小写并分词
	words := strings.Fields(strings.ToLower(cleaned))
	return words
}

func main() {
	text := "Hello, World! This is a test sentence."
	tokens := tokenize(text)
	fmt.Printf("分词结果: %v\n", tokens)
}
```

### 2.2 中文分词

```bash
go get github.com/go-ego/gse
```

```go
package main

import (
	"fmt"

	"github.com/go-ego/gse"
)

func main() {
	seg := gse.New()

	// 加载词典
	err := seg.LoadDict("zh")
	if err != nil {
		panic(err)
	}

	text := "我爱北京天安门"
	words := seg.Cut(text, true)
	fmt.Printf("中文分词结果: %v\n", words)
}
```

## 三、词频统计

### 3.1 基础词频统计

```go
package main

import (
	"fmt"
	"sort"
	"strings"
)

func wordFrequency(text string) map[string]int {
	words := strings.Fields(strings.ToLower(text))
	freq := make(map[string]int)
	for _, word := range words {
		freq[word]++
	}
	return freq
}

func sortByFrequency(freq map[string]int) []string {
	words := make([]string, 0, len(freq))
	for word := range freq {
		words = append(words, word)
	}
	sort.Slice(words, func(i, j int) bool {
		return freq[words[i]] > freq[words[j]]
	})
	return words
}

func main() {
	text := "Hello world! Hello Go! Go is great. Go is fun."
	freq := wordFrequency(text)
	sortedWords := sortByFrequency(freq)

	fmt.Println("词频统计:")
	for _, word := range sortedWords {
		fmt.Printf("%s: %d\n", word, freq[word])
	}
}
```

### 3.2 TF-IDF计算

```go
package main

import (
	"fmt"
	"math"
)

func computeTF(term string, doc []string) float64 {
	count := 0
	for _, word := range doc {
		if word == term {
			count++
		}
	}
	return float64(count) / float64(len(doc))
}

func computeIDF(term string, docs [][]string) float64 {
	docCount := 0
	for _, doc := range docs {
		for _, word := range doc {
			if word == term {
				docCount++
				break
			}
		}
	}
	return math.Log(float64(len(docs)) / float64(docCount+1))
}

func computeTFIDF(term string, doc []string, docs [][]string) float64 {
	tf := computeTF(term, doc)
	idf := computeIDF(term, docs)
	return tf * idf
}

func main() {
	docs := [][]string{
		{"hello", "world", "go"},
		{"hello", "go", "lang"},
		{"world", "programming"},
	}

	term := "go"
	doc := docs[0]
	tfidf := computeTFIDF(term, doc, docs)
	fmt.Printf("TF-IDF for %s: %.4f\n", term, tfidf)
}
```

## 四、文本分类

### 4.1 朴素贝叶斯分类器

```go
package main

import (
	"fmt"
	"math"
)

type NaiveBayesClassifier struct {
	classCounts    map[string]int
	wordCounts     map[string]map[string]int
	totalDocuments int
}

func NewNaiveBayesClassifier() *NaiveBayesClassifier {
	return &NaiveBayesClassifier{
		classCounts: make(map[string]int),
		wordCounts:  make(map[string]map[string]int),
	}
}

func (nb *NaiveBayesClassifier) Train(docs []string, labels []string) {
	for i, doc := range docs {
		label := labels[i]
		nb.classCounts[label]++
		nb.totalDocuments++

		if _, ok := nb.wordCounts[label]; !ok {
			nb.wordCounts[label] = make(map[string]int)
		}

		words := strings.Fields(strings.ToLower(doc))
		for _, word := range words {
			nb.wordCounts[label][word]++
		}
	}
}

func (nb *NaiveBayesClassifier) Predict(doc string) string {
	words := strings.Fields(strings.ToLower(doc))
	bestClass := ""
	bestScore := math.Inf(-1)

	for class := range nb.classCounts {
		score := math.Log(float64(nb.classCounts[class]) / float64(nb.totalDocuments))
		for _, word := range words {
			wordCount := nb.wordCounts[class][word]
			score += math.Log(float64(wordCount+1) /
				float64(nb.classCounts[class]+len(nb.wordCounts[class])))
		}
		if score > bestScore {
			bestScore = score
			bestClass = class
		}
	}
	return bestClass
}

func main() {
	nb := NewNaiveBayesClassifier()

	docs := []string{
		"I love this movie",
		"Great film, highly recommend",
		"Terrible movie, waste of time",
		"Hated every minute of it",
		"Excellent performance",
		"Poor acting, bad script",
	}
	labels := []string{"positive", "positive", "negative", "negative", "positive", "negative"}

	nb.Train(docs, labels)

	testDoc := "This movie was amazing"
	prediction := nb.Predict(testDoc)
	fmt.Printf("预测结果: %s\n", prediction)
}
```

## 五、文本生成

### 5.1 马尔可夫链文本生成

```go
package main

import (
	"fmt"
	"math/rand"
	"strings"
	"time"
)

type MarkovChain struct {
	transitions map[string][]string
	order       int
}

func NewMarkovChain(order int) *MarkovChain {
	rand.Seed(time.Now().UnixNano())
	return &MarkovChain{
		transitions: make(map[string][]string),
		order:       order,
	}
}

func (mc *MarkovChain) Train(text string) {
	words := strings.Fields(strings.ToLower(text))
	for i := 0; i < len(words)-mc.order-1; i++ {
		key := strings.Join(words[i:i+mc.order], " ")
		nextWord := words[i+mc.order]
		mc.transitions[key] = append(mc.transitions[key], nextWord)
	}
}

func (mc *MarkovChain) Generate(length int) string {
	// 随机选择起始状态
	keys := make([]string, 0, len(mc.transitions))
	for key := range mc.transitions {
		keys = append(keys, key)
	}
	if len(keys) == 0 {
		return ""
	}

	current := keys[rand.Intn(len(keys))]
	result := strings.Split(current, " ")

	for i := 0; i < length-mc.order; i++ {
		nextWords := mc.transitions[current]
		if len(nextWords) == 0 {
			break
		}
		nextWord := nextWords[rand.Intn(len(nextWords))]
		result = append(result, nextWord)

		// 更新当前状态
		current = strings.Join(result[len(result)-mc.order:], " ")
	}
	return strings.Join(result, " ")
}

func main() {
	text := "I love Go programming. Go is a great language. Go is fast and efficient. I love programming in Go."

	mc := NewMarkovChain(2)
	mc.Train(text)

	generated := mc.Generate(10)
	fmt.Printf("生成文本: %s\n", generated)
}
```

## 六、情感分析

### 6.1 简单情感分析

```go
package main

import (
	"fmt"
	"strings"
)

var positiveWords = map[string]bool{
	"love": true, "great": true, "excellent": true, "amazing": true,
	"good": true, "best": true, "wonderful": true, "fantastic": true,
}

var negativeWords = map[string]bool{
	"hate": true, "terrible": true, "bad": true, "awful": true,
	"worst": true, "poor": true, "horrible": true,
}

func analyzeSentiment(text string) float64 {
	words := strings.Fields(strings.ToLower(text))
	positiveCount := 0
	negativeCount := 0

	for _, word := range words {
		if positiveWords[word] {
			positiveCount++
		}
		if negativeWords[word] {
			negativeCount++
		}
	}

	total := positiveCount + negativeCount
	if total == 0 {
		return 0
	}
	return float64(positiveCount-negativeCount) / float64(total)
}

func main() {
	texts := []string{
		"I love this movie, it's amazing!",
		"Terrible experience, hated it.",
		"It was okay, not great but not bad.",
	}

	for _, text := range texts {
		score := analyzeSentiment(text)
		sentiment := "中性"
		if score > 0.3 {
			sentiment = "正面"
		} else if score < -0.3 {
			sentiment = "负面"
		}
		fmt.Printf("文本: %s\n情感得分: %.2f (%s)\n\n", text, score, sentiment)
	}
}
```

## 七、命名实体识别

### 7.1 基于规则的NER

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

type Entity struct {
	Text  string
	Type  string
	Start int
	End   int
}

func extractEntities(text string) []Entity {
	var entities []Entity

	// 匹配邮箱
	emailRegex := regexp.MustCompile(`[\w.-]+@[\w.-]+\.\w+`)
	for _, match := range emailRegex.FindAllStringIndex(text, -1) {
		entities = append(entities, Entity{
			Text:  text[match[0]:match[1]],
			Type:  "EMAIL",
			Start: match[0],
			End:   match[1],
		})
	}

	// 匹配电话号码
	phoneRegex := regexp.MustCompile(`\d{3,4}[-.]?\d{4}[-.]?\d{4}`)
	for _, match := range phoneRegex.FindAllStringIndex(text, -1) {
		entities = append(entities, Entity{
			Text:  text[match[0]:match[1]],
			Type:  "PHONE",
			Start: match[0],
			End:   match[1],
		})
	}

	// 匹配网址
	urlRegex := regexp.MustCompile(`https?://[\w.-]+(?:/[\w./-]*)?`)
	for _, match := range urlRegex.FindAllStringIndex(text, -1) {
		entities = append(entities, Entity{
			Text:  text[match[0]:match[1]],
			Type:  "URL",
			Start: match[0],
			End:   match[1],
		})
	}

	return entities
}

func main() {
	text := "联系我们: support@example.com 或拨打 123-4567-8900 更多信息请访问 https://www.example.com/products"

	entities := extractEntities(text)
	fmt.Println("提取的实体:")
	for _, entity := range entities {
		fmt.Printf("类型: %s, 文本: %s, 位置: [%d-%d]\n",
			entity.Type, entity.Text, entity.Start, entity.End)
	}
}
```

## 八、文本相似度

### 8.1 余弦相似度

```go
package main

import (
	"fmt"
	"math"
	"strings"
)

func tokenize(text string) map[string]int {
	words := strings.Fields(strings.ToLower(text))
	freq := make(map[string]int)
	for _, word := range words {
		freq[word]++
	}
	return freq
}

func dotProduct(v1, v2 map[string]int) int {
	sum := 0
	for word, count := range v1 {
		sum += count * v2[word]
	}
	return sum
}

func magnitude(v map[string]int) float64 {
	sum := 0
	for _, count := range v {
		sum += count * count
	}
	return math.Sqrt(float64(sum))
}

func cosineSimilarity(text1, text2 string) float64 {
	v1 := tokenize(text1)
	v2 := tokenize(text2)

	dot := dotProduct(v1, v2)
	mag1 := magnitude(v1)
	mag2 := magnitude(v2)

	if mag1 == 0 || mag2 == 0 {
		return 0
	}
	return float64(dot) / (mag1 * mag2)
}

func main() {
	text1 := "I love programming in Go"
	text2 := "Go is a great programming language"
	text3 := "Cats are cute animals"

	similarity12 := cosineSimilarity(text1, text2)
	similarity13 := cosineSimilarity(text1, text3)

	fmt.Printf("文本1与文本2相似度: %.4f\n", similarity12)
	fmt.Printf("文本1与文本3相似度: %.4f\n", similarity13)
}
```

## 九、实战：文本搜索引擎

```go
package main

import (
	"fmt"
	"sort"
	"strings"
)

type Document struct {
	ID    int
	Title string
	Body  string
}

type SearchEngine struct {
	documents []Document
	index     map[string][]int
}

func NewSearchEngine() *SearchEngine {
	return &SearchEngine{
		documents: make([]Document, 0),
		index:     make(map[string][]int),
	}
}

func (se *SearchEngine) AddDocument(doc Document) {
	se.documents = append(se.documents, doc)
	docID := len(se.documents) - 1

	words := strings.Fields(strings.ToLower(doc.Title + " " + doc.Body))
	seen := make(map[string]bool)
	for _, word := range words {
		if !seen[word] {
			se.index[word] = append(se.index[word], docID)
			seen[word] = true
		}
	}
}

func (se *SearchEngine) Search(query string) []Document {
	queryWords := strings.Fields(strings.ToLower(query))

	// 找到包含所有查询词的文档
	var resultIDs []int
	for i, word := range queryWords {
		if docIDs, ok := se.index[word]; ok {
			if i == 0 {
				resultIDs = docIDs
			} else {
				// 求交集
				resultIDs = intersect(resultIDs, docIDs)
			}
		} else {
			return []Document{}
		}
	}

	// 获取文档
	results := make([]Document, 0, len(resultIDs))
	for _, id := range resultIDs {
		results = append(results, se.documents[id])
	}
	return results
}

func intersect(a, b []int) []int {
	result := make([]int, 0)
	i, j := 0, 0
	for i < len(a) && j < len(b) {
		if a[i] == b[j] {
			result = append(result, a[i])
			i++
			j++
		} else if a[i] < b[j] {
			i++
		} else {
			j++
		}
	}
	return result
}

func main() {
	se := NewSearchEngine()

	se.AddDocument(Document{Title: "Go Programming", Body: "Go is a programming language created by Google"})
	se.AddDocument(Document{Title: "Machine Learning", Body: "Machine learning is a subset of AI"})
	se.AddDocument(Document{Title: "Go and AI", Body: "Go can be used for AI and machine learning"})

	results := se.Search("Go programming")
	fmt.Println("搜索结果:")
	for _, doc := range results {
		fmt.Printf("标题: %s\n内容: %s\n\n", doc.Title, doc.Body)
	}
}
```

## 十、总结

本文介绍了如何使用Go语言进行自然语言处理，包括：

- 文本处理基础：字符串操作、Unicode处理
- 分词处理：英文分词和中文分词
- 词频统计：基础词频和TF-IDF计算
- 文本分类：朴素贝叶斯分类器
- 文本生成：马尔可夫链文本生成
- 情感分析：基于词典的情感分析
- 命名实体识别：基于规则的NER
- 文本相似度：余弦相似度计算
- 实战项目：简单文本搜索引擎

通过这些实现，你可以使用Go语言构建各种NLP应用，充分利用Go的性能优势处理大规模文本数据。

PEExplorerV2终极指南：快速掌握Windows可执行文件分析的免费神器

PEExplorerV2终极指南：快速掌握Windows可执行文件分析的免费神器【免费下载链接】PEExplorerV2 Portable Executable Explorer version 2 项目地址: https://gitcode.com/gh_mirrors/pe/PEExplorerV2 PEExplorerV2是一款功能强大的Windows可执行文件分析工具…...

2026/5/30 1:15:11 阅读更多 →

“前后端已死” 背后，Web2 开发者在焦虑什么？

进入 2026 年，关于 “前端要没了” “后端也快被替代了” “程序员是不是要失业了” 的讨论几乎没停过。AI 让效率翻倍，也有人开始担心，自己会在不知不觉中被时代甩下。这次，TinTinLand 找到了三位来自 Web2/ 传统 IT 行业的开发者…...

2026/5/30 1:12:59 阅读更多 →

解码韬定律：从τ缩微到衡真旋体系的本源同构验证

解码韬定律：从τ缩微到衡真旋体系的本源同构验证 ——华为六年工程突破，印证通用宇宙底层法则 作者： 华夏之光永存 / 九天应元雷声普化天尊 文章信息来源： 经典依据：《九天应元雷声普化天尊玉枢宝经》 本源依据…...

2026/5/30 1:10:12 阅读更多 →

【限时解密】Claude 3.5 Sonnet专属编程模式：仅开放给前500家企业的上下文感知补全协议

更多请点击： https://kaifayun.com 第一章：Claude 3.5 Sonnet编程辅助的核心能力边界与适用场景 Claude 3.5 Sonnet 在编程辅助领域展现出显著的推理深度与上下文理解能力，但其本质仍是基于大规模语言模型的生成式系统，不具备实时…...

2026/5/28 15:08:49 阅读更多 →

RMAN 增量备份（Incremental Backup）

1、概念RMAN 增量备份是指 RMAN 只备份自上次备份以来发生过更改的数据块，而不是备份整个数据库的所有数据块。它是 Oracle 为解决大型数据库全量备份时间长、占用空间大的问题而设计的核心特性，也是现代企业级备份策略的基础。简单类比：全库…...

2026/5/27 0:57:50 阅读更多 →

终极指南：掌握ProperTree跨平台Plist编辑器的10个高效技巧

终极指南：掌握ProperTree跨平台Plist编辑器的10个高效技巧【免费下载链接】ProperTree Cross platform GUI plist editor written in python. 项目地址: https://gitcode.com/gh_mirrors/pr/ProperTree 想要轻松编辑macOS和iOS的配置文件却苦于复杂的XML语法…...

2026/5/27 16:46:38 阅读更多 →

ScriptHookV解决方案：如何安全扩展GTA V游戏功能而不修改原始文件

ScriptHookV解决方案：如何安全扩展GTA V游戏功能而不修改原始文件【免费下载链接】ScriptHookV An open source hook into GTAV for loading offline mods 项目地址: https://gitcode.com/gh_mirrors/sc/ScriptHookV ScriptHookV是一个专为《侠盗猎车手V》…...

2026/5/27 17:17:05 阅读更多 →