TODO: wordIndexUpperLower 原因: 边界

2019-08-19 19:03:58 +08:00 · 2019-08-19 19:03:58 +08:00 · 5a7a4f2c92
commit 5a7a4f2c92
parent b4829ba058
3 changed files with 251 additions and 41 deletions
--- a/tree/tried/tried.go
+++ b/tree/tried/tried.go
@ -1,15 +1,5 @@
 package tried

-type TriedString string
-
-func (ts TriedString) Size() uint {
-	return uint(len(ts))
-}
-
-func (ts TriedString) WordIndex(idx uint) uint {
-	return uint(ts[idx]) - 'a'
-}
-
 // func (ts TriedString) WordIndex(idx uint) uint {
 // 	w := ts[idx]
 // 	if w >= 'a' && w <= 'z' {
@ -21,14 +11,9 @@ func (ts TriedString) WordIndex(idx uint) uint {
 // 	}
 // }

-type ObjectIndex interface {
-	WordIndex(idx uint) uint
-	Size() uint
-}
-
 type Tried struct {
-	root     *Node
-	datasize uint
+	root    *Node           // node every Put and Get starts walking from
+	wiStore *wordIndexStore // byte-to-slot mapping and per-node slot count used by Put and Get
 }

 type Node struct {
@ -36,25 +21,34 @@ type Node struct {
 	value interface{}
 }

+// New returns an empty Tried that uses WordIndexDict[WordIndexLower], i.e. it only supports lowercase words.
 func New() *Tried {
 	tried := &Tried{}
 	tried.root = new(Node)
-	tried.datasize = 62
+
+	tried.wiStore = WordIndexDict[WordIndexLower]
 	return tried
 }

-func (tried *Tried) wordIndex(w byte) uint {
-	return uint(w) - 'a'
+// NewWithWordType returns an empty Tried using the word index registered for t (e.g. WordIndexLower supports lowercase only); a t missing from WordIndexDict leaves wiStore nil.
+func NewWithWordType(t WordIndexType) *Tried {
+	tried := &Tried{}
+	tried.root = new(Node)
+
+	tried.wiStore = WordIndexDict[t]
+
+	return tried
 }

 func (tried *Tried) Put(words string, values ...interface{}) {
 	cur := tried.root
 	var n *Node
+
 	for i := 0; i < len(words); i++ {
-		w := tried.wordIndex(words[i])
+		w := tried.wiStore.Byte2Index(words[i])

 		if cur.data == nil {
-			cur.data = make([]*Node, tried.datasize)
+			cur.data = make([]*Node, tried.wiStore.DataSize)
 		}

 		if n = cur.data[w]; n == nil {
@ -81,8 +75,9 @@ func (tried *Tried) Put(words string, values ...interface{}) {
 func (tried *Tried) Get(words string) interface{} {
 	cur := tried.root
 	var n *Node
+
 	for i := 0; i < len(words); i++ {
-		w := tried.wordIndex(words[i]) //TODO: 升级Index 函数
+		w := tried.wiStore.Byte2Index(words[i]) //TODO: 升级Index 函数
 		if n = cur.data[w]; n == nil {
 			return nil
 		}
--- a/tree/tried/tried_index.go
+++ b/tree/tried/tried_index.go
@ -0,0 +1,152 @@
+package tried
+
+// WordIndexDict holds the wordIndexStore registered for each WordIndexType.
+// It is populated once by init.
+var WordIndexDict map[WordIndexType]*wordIndexStore
+
+// init registers one wordIndexStore per supported WordIndexType.
+func init() {
+	// register stores the conversion pair and slot count under type t.
+	register := func(t WordIndexType, toIndex func(byte) uint, toByte func(uint) byte, size uint) {
+		WordIndexDict[t] = &wordIndexStore{
+			Type:       t,
+			Byte2Index: toIndex,
+			Index2Byte: toByte,
+			DataSize:   size,
+		}
+	}
+
+	WordIndexDict = make(map[WordIndexType]*wordIndexStore)
+
+	register(WordIndexLower, wordIndexLower, indexWordLower, 26)
+	register(WordIndexUpper, wordIndexUpper, indexWordUpper, 26)
+	register(WordIndexDigital, wordIndexDigital, indexWordDigital, 10)
+	register(WordIndexUpperLower, wordIndexUpperLower, indexWordUpperLower, 52)
+	register(WordIndexLowerDigital, wordIndexLowerDigital, indexWordLowerDigital, 36)
+	register(WordIndexUpperDigital, wordIndexUpperDigital, indexWordUpperDigital, 36)
+	register(WordIndexUpperLowerDigital, wordIndexUpperLowerDigital, indexWordUpperLowerDigital, 62)
+	register(WordIndex256, wordIndex256, indexWord256, 256)
+	register(WordIndex32to126, wordIndex32to126, indexWord32to126, '~'-' '+1)
+}
+
+// WordIndexType selects which kind of words a Tried accepts, e.g. WordIndexLower means Put only supports lowercase words.
+type WordIndexType int
+
+const (
+	_ WordIndexType = iota // iota value 0 is discarded
+	WordIndexLower // 'a'-'z'; registered in init with 26 slots
+	WordIndexUpper // 'A'-'Z'; registered in init with 26 slots
+	WordIndexDigital // '0'-'9'; registered in init with 10 slots
+	WordIndexUpperLower // letters of both cases; 52 slots
+	WordIndexLowerDigital // lowercase letters and digits; 36 slots
+	WordIndexUpperDigital // uppercase letters and digits; 36 slots
+	WordIndexUpperLowerDigital // letters of both cases and digits; 62 slots
+	WordIndex256 // every byte value; 256 slots
+	WordIndex32to126 // bytes from ' ' (32) through '~' (126); '~'-' '+1 slots
+)
+
+type wordIndexStore struct { // conversion functions and slot count for one WordIndexType
+	Type       WordIndexType   // the type this store is registered under in WordIndexDict
+	Byte2Index func(byte) uint // maps an input byte to the child slot Put and Get index with
+	Index2Byte func(uint) byte // maps a child slot back to a byte
+	DataSize   uint            // length Put allocates for each node's child slice
+}
+
+func wordIndexLower(w byte) uint {
+	return uint(w) - 'a'
+}
+
+func indexWordLower(w uint) byte {
+	return byte(w) + 'a'
+}
+
+//
+func wordIndexUpper(w byte) uint {
+	return uint(w) - 'A'
+}
+
+func indexWordUpper(w uint) byte {
+	return byte(w) + 'A'
+}
+
+//
+func wordIndexDigital(w byte) uint {
+	return uint(w) - '0'
+}
+
+func indexWordDigital(w uint) byte {
+	return byte(w) + '0'
+}
+
+// wordIndexUpperLower returns the slot of a letter of either case:
+// 'a'..'z' map to 0..25 and 'A'..'Z' map to 26..51.
+// Bytes outside both ranges are not handled and yield an out-of-range slot.
+func wordIndexUpperLower(w byte) uint {
+	iw := uint(w)
+	// >= (not >) so that 'a' itself lands on slot 0 instead of falling
+	// through to the uppercase branch and overflowing the 52 slots.
+	if iw >= 'a' {
+		return iw - 'a'
+	}
+	return iw - 'A' + 26
+}
+
+// indexWordUpperLower returns the byte for a slot in the 52-slot letter layout:
+// slots 0..25 are 'a'..'z' and slots 26..51 are 'A'..'Z'.
+func indexWordUpperLower(w uint) byte {
+	if w >= 26 {
+		// Remove the 26-slot offset of the uppercase range before adding 'A'.
+		return byte(w-26) + 'A'
+	}
+	return byte(w) + 'a'
+}
+
+// wordIndexLowerDigital returns the slot of a lowercase letter or digit:
+// 'a'..'z' map to 0..25 and '0'..'9' map to 26..35.
+// Bytes outside both ranges are not handled and yield an out-of-range slot.
+func wordIndexLowerDigital(w byte) uint {
+	iw := uint(w)
+	// >= (not >) so that 'a' itself is treated as a letter.
+	if iw >= 'a' {
+		return iw - 'a'
+	}
+	return iw - '0' + 26
+}
+
+// indexWordLowerDigital returns the byte for a slot in the 36-slot layout:
+// slots 0..25 are 'a'..'z' and slots 26..35 are '0'..'9'.
+func indexWordLowerDigital(w uint) byte {
+	if w >= 26 {
+		// Remove the 26-slot offset of the digit range before adding '0'.
+		return byte(w-26) + '0'
+	}
+	return byte(w) + 'a'
+}
+
+// wordIndexUpperDigital returns the slot of an uppercase letter or digit:
+// 'A'..'Z' map to 0..25 and '0'..'9' map to 26..35.
+// Bytes outside both ranges are not handled and yield an out-of-range slot.
+func wordIndexUpperDigital(w byte) uint {
+	iw := uint(w)
+	// >= (not >) so that 'A' itself is treated as a letter.
+	if iw >= 'A' {
+		return iw - 'A'
+	}
+	return iw - '0' + 26
+}
+
+// indexWordUpperDigital returns the byte for a slot in the 36-slot layout:
+// slots 0..25 are 'A'..'Z' and slots 26..35 are '0'..'9'.
+func indexWordUpperDigital(w uint) byte {
+	if w >= 26 {
+		// Remove the 26-slot offset of the digit range before adding '0'.
+		return byte(w-26) + '0'
+	}
+	// Letter slots belong to the uppercase alphabet, so the base is 'A'.
+	return byte(w) + 'A'
+}
+
+// wordIndexUpperLowerDigital returns the slot of a letter or digit:
+// 'a'..'z' map to 0..25, 'A'..'Z' map to 26..51 and '0'..'9' map to 52..61.
+// Bytes outside these ranges are not handled and yield an out-of-range slot.
+func wordIndexUpperLowerDigital(w byte) uint {
+	iw := uint(w)
+	// The comparisons are inclusive so that 'a' and 'A' stay in their own ranges.
+	switch {
+	case iw >= 'a':
+		return iw - 'a'
+	case iw >= 'A':
+		return iw - 'A' + 26
+	}
+	return iw - '0' + 52
+}
+
+// indexWordUpperLowerDigital returns the byte for a slot in the 62-slot layout:
+// slots 0..25 are 'a'..'z', 26..51 are 'A'..'Z' and 52..61 are '0'..'9'.
+func indexWordUpperLowerDigital(w uint) byte {
+	// Each range's starting slot is subtracted before adding its base byte.
+	switch {
+	case w >= 52:
+		return byte(w-52) + '0'
+	case w >= 26:
+		return byte(w-26) + 'A'
+	}
+	return byte(w) + 'a'
+}
+
+// wordIndex256 uses the byte value itself as the slot, covering all byte values.
+func wordIndex256(w byte) uint {
+	idx := uint(w)
+	return idx
+}
+
+// indexWord256 converts a slot back to the byte with the same value.
+func indexWord256(w uint) byte {
+	b := byte(w)
+	return b
+}
+
+// wordIndex32to126 空格-~ 0-9 a-z A-Z 符号等
+func wordIndex32to126(w byte) uint {
+	return uint(w) - ' '
+}
+
+func indexWord32to126(w uint) byte {
+	return byte(w) + ' '
+}
--- a/tree/tried/tried_test.go
+++ b/tree/tried/tried_test.go
@ -1,11 +1,44 @@
 package tried

 import (
+	"bytes"
+	"encoding/gob"
+	"os"
 	"testing"

 	"github.com/Pallinder/go-randomdata"
 )

+// TestTried_NewWith puts one word into a Tried built for each listed word
+// index type and expects Get to return a non-nil value for it.
+func TestTried_NewWith(t *testing.T) {
+	cases := []struct {
+		kind  WordIndexType
+		words string
+	}{
+		{WordIndex32to126, "~ 23fd "},
+		{WordIndexLower, "az"},
+		{WordIndexUpper, "AZ"},
+		{WordIndexUpperLower, "AZazsdfsd"},
+	}
+
+	for _, c := range cases {
+		tried := NewWithWordType(c.kind)
+		tried.Put(c.words)
+		if tried.Get(c.words) == nil {
+			t.Error("should be not nil")
+		}
+	}
+}
+
 func TestTried_PutAndGet1(t *testing.T) {
 	tried := New()

@ -72,20 +105,49 @@ func TestTried_Traversal(t *testing.T) {
 	}
 }

+// TesStoreData generates N random words, gob-encodes the slice and writes it
+// to "tried.log". Any encode, open, write or close failure fails the test.
+// NOTE(review): the name lacks the "Test" prefix, so `go test` will not run
+// it automatically; confirm whether that is intentional.
+func TesStoreData(t *testing.T) {
+	const N = 1000000
+	l := make([]string, 0, N)
+	for i := 0; i < N; i++ {
+		var content []rune
+		// The length bound is drawn again on every iteration of this loop.
+		for c := 0; c < randomdata.Number(5, 15); c++ {
+			char := randomdata.Number(0, 26) + 'a'
+			content = append(content, rune(byte(char)))
+		}
+		l = append(l, string(content))
+	}
+
+	var result bytes.Buffer
+	if err := gob.NewEncoder(&result).Encode(l); err != nil {
+		t.Fatalf("encode words: %v", err)
+	}
+
+	f, err := os.OpenFile("tried.log", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666)
+	if err != nil {
+		t.Fatalf("open tried.log: %v", err)
+	}
+
+	// Close before reporting so the handle is released on every path.
+	_, writeErr := f.Write(result.Bytes())
+	closeErr := f.Close()
+	if writeErr != nil {
+		t.Fatalf("write tried.log: %v", writeErr)
+	}
+	if closeErr != nil {
+		t.Fatalf("close tried.log: %v", closeErr)
+	}
+}
+
+// Load reads the gob-encoded string slice stored in "tried.log".
+// It returns nil when the file cannot be opened or decoded.
+func Load() []string {
+	f, err := os.Open("tried.log")
+	if err != nil {
+		return nil
+	}
+	defer f.Close() // read-only handle; nothing to flush
+
+	var result []string
+	if err := gob.NewDecoder(f).Decode(&result); err != nil {
+		return nil
+	}
+	return result
+}
+
 func BenchmarkTried_Put(b *testing.B) {

 	var data []string
 	b.N = 1000000
 	count := 10

-	for i := 0; i < b.N; i++ {
-		var content []rune
-		for c := 0; c < randomdata.Number(5, 15); c++ {
-			char := randomdata.Number(0, 26) + 'a'
-			content = append(content, rune(byte(char)))
-		}
-		data = append(data, (string(content)))
-	}
+	// for i := 0; i < b.N; i++ {
+	// 	var content []rune
+	// 	for c := 0; c < randomdata.Number(5, 15); c++ {
+	// 		char := randomdata.Number(0, 26) + 'a'
+	// 		content = append(content, rune(byte(char)))
+	// 	}
+	// 	data = append(data, (string(content)))
+	// }
+
+	data = Load()

 	b.ResetTimer()
 	b.N = b.N * count
@ -98,19 +160,20 @@ func BenchmarkTried_Put(b *testing.B) {
 }

 func BenchmarkTried_Get(b *testing.B) {
-
+	b.StopTimer()
 	var data []string
 	b.N = 1000000
 	count := 10

-	for i := 0; i < b.N; i++ {
-		var content []rune
-		for c := 0; c < randomdata.Number(5, 15); c++ {
-			char := randomdata.Number(0, 26) + 'a'
-			content = append(content, rune(byte(char)))
-		}
-		data = append(data, string(content))
-	}
+	// for i := 0; i < b.N; i++ {
+	// 	var content []rune
+	// 	for c := 0; c < randomdata.Number(5, 15); c++ {
+	// 		char := randomdata.Number(0, 26) + 'a'
+	// 		content = append(content, rune(byte(char)))
+	// 	}
+	// 	data = append(data, string(content))
+	// }
+	data = Load()

 	b.N = b.N * count

@ -119,7 +182,7 @@ func BenchmarkTried_Get(b *testing.B) {
 		tried.Put(v)
 	}

-	b.ResetTimer()
+	b.StartTimer()
 	for c := 0; c < count; c++ {
 		for _, v := range data {
 			tried.Get(v)