// 分词计算频次代码-GO
// 作者:互联网
package main
import (
"fmt"
"github.com/huichen/sego"
"os"
"path/filepath"
"sort"
"strings"
"unicode/utf8"
)
// Pair couples a map key with its integer value so that map entries can be
// placed in a slice and sorted.
type Pair struct {
	Key   string
	Value int
}

// PairList implements sort.Interface, so it can be sorted with sort.Sort.
//
// The ordering is by descending Value. Pairs with equal Value are ordered by
// ascending Key, which keeps the sorted result independent of the order in
// which the pairs were inserted (for example, random map iteration order).
type PairList []Pair

// Swap exchanges the elements at indexes i and j.
func (p PairList) Swap(i, j int) { p[i], p[j] = p[j], p[i] }

// Len returns the number of pairs in the list.
func (p PairList) Len() int { return len(p) }

// Less reports whether the pair at index i must sort before the pair at
// index j: larger values first, then smaller keys.
func (p PairList) Less(i, j int) bool {
	if p[i].Value != p[j].Value {
		return p[i].Value > p[j].Value // reverse (descending) order
	}
	return p[i].Key < p[j].Key
}
func main() {
if len(os.Args) == 1 || os.Args[1] == "-h" || os.Args[1] == "--help" {
fmt.Printf("usage: %s <file1> [<file2> [... <fileN>]]\n",
filepath.Base(os.Args[0]))
os.Exit(1)
}
// 载入词典
var segmenter sego.Segmenter
segmenter.LoadDictionary("../github.com/huichen/sego/data/dictionary.txt")
//读取文件内容到buf中
//filename := "./test2.txt"
filenameArray := os.Args[1:]
filename := filenameArray[0]
fp, err := os.Open(filename)
defer fp.Close()
if err != nil {
fmt.Println(filename, err)
return
}
buf := make([]byte, 409600)
n, _ := fp.Read(buf)
if n == 0 {
return
}
// 分词
segments := segmenter.Segment(buf)
// 处理分词结果
// 支持普通模式和搜索模式两种分词,见utils.go代码中SegmentsToString函数的注释。
// 如果需要词性标注,用SegmentsToString(segments, false),更多参考utils.go文件
output := sego.SegmentsToSlice(segments, false)
//fmt.Println(len(output))
var memory map[string]int
memory = make(map[string]int, 10000)
//输出分词后的成语
for _, s := range output {
/*if len(s) > 0 && fmt.Sprintf("%#v", s) != "\"\\x00\"" && fmt.Sprintf("%#v", s) != "\" \"" && fmt.Sprintf("%#v", s) != "\"\\n\"" {
fmt.Println(s)
//按照分词顺序重新排列文章,改为换行
}*/
if len(s) > utf8.UTFMax ||
utf8.RuneCountInString(s) > 1 {
//fmt.Println(s)
memory[strings.ToLower(s)] += 1
}
}
//fmt.Println(memory)
//统计排序
p := make(PairList, len(memory))
i := 0
for k, v := range memory { // 将wordcount map转换成PairList
p[i] = Pair{k, v}
i++
}
sort.Sort(p) // 因为PairList实现了排序接口,所以可以使用sort.Sort()对其排序
wordWidth, frequencyWidth := 0, 0
for _, pair := range p {
word, frequency := pair.Key, pair.Value
if width := utf8.RuneCountInString(word); width > wordWidth {
wordWidth = width
}
if width := len(fmt.Sprint(frequency)); width > frequencyWidth {
frequencyWidth = width
}
}
gap := wordWidth + frequencyWidth - len("Word") - len("Frequency")
fmt.Printf("Word %*s%s\n", gap, " ", "Frequency")
for _, pair := range p {
fmt.Printf("%-*s %*d\n", wordWidth, pair.Key, frequencyWidth,
pair.Value)
}
}
// 执行命令:
//   go build main.go
//   ./main text.txt | head -n 10
//
// 标签:fmt,PairList,len,频次,width,memory,GO,os,分词
// 来源: https://blog.csdn.net/aichojie/article/details/123129955