萌新筆記——C++裡創立 Trie字典樹(中文詞典)(三)(聯想)。本站提示廣大學習愛好者:(萌新筆記——C++裡創立 Trie字典樹(中文詞典)(三)(聯想))文章只能為提供參考,不一定能成為您想要的結果。以下是萌新筆記——C++裡創立 Trie字典樹(中文詞典)(三)(聯想)正文
萌新做詞典第三篇,做得不好,還請指正,謝謝大佬!
明天把詞典的聯想做好了,也是比擬low的,還改了之前的查詢、遍歷等代碼。 Orz
一樣地先放上運轉後果:
1 test1 2 ID : 2 char : 件 word : 編程軟件 3 ID : 3 char : 習 word : 編程學習 4 ID : 4 char : 站 word : 編程學習網站 5 ID : 1 char : 門 word : 編程入門 6 7 test2 8 ID : 5 char : 練 word : 編程訓練 9 ID : 1 char : 門 word : 編程入門 10 ID : 3 char : 習 word : 編程學習 11 ID : 4 char : 站 word : 編程學習網站 12 ID : 2 char : 件 word : 編程軟件 13 find ID : 3 word : 編程學習 14 15 associate "編程" : 16 find! 17 訓練 18 入門 19 學習 20 學習網站 21 軟件
測試用的test.cc
1 #include "Dictionary.h"
2 #include <iostream>
3 #include <string>
4 #include <vector>
5 using std::cout;
6 using std::endl;
7 using std::string;
8 using std::vector;
9
10 int test1()
11 {
12 ccx::Dictionary words;
13 string word1 = "編程入門";
14 string word2 = "編程軟件";
15 string word3 = "編程學習";
16 string word4 = "編程學習網站";
17
18 words.push(word1);
19 words.push(word2);
20 words.push(word3);
21 words.push(word4);
22
23 words.resetIt();
24
25 while(!words.isEnd())
26 {
27 cout << "ID : " << words.getCurWordId()
28 << "\tchar : " << words.getCurChar()
29 << "\tword : " << words.getCurWord() << endl;
30 words.next();
31 }
32
33 words.leading_out();
34 return 0;
35 }
36
37
38 int test2()
39 {
40 ccx::Dictionary words;
41 words.leading_in();
42
43
44 string word("編程訓練");
45 words.push(word);
46 words.resetIt();
47
48 while(!words.isEnd())
49 {
50 cout << "ID : " << words.getCurWordId()
51 << "\tchar : " << words.getCurChar()
52 << "\tword : " << words.getCurWord() << endl;
53 words.next();
54 }
55 string tmp = "編程學習";
56 int id = words.search(tmp);
57 if(-1 == id)
58 {
59 cout << "no such word like \"" << tmp << "\"" << endl;
60 }else{
61 cout << "find ID : " << id
62 << "\tword : " << tmp << endl;
63 }
64
65 cout << endl;
66 cout << "associate \"編程\" : " << endl;
67
68 vector<string> data;
69 string temp = "編程";
70
71 if(words.associate(temp, data))
72 {
73 cout << "find!" << endl;
74 for(auto & elem : data)
75 {
76 cout << elem << endl;
77 }
78 }else{
79 cout << "can't find" << endl;
80 }
81
82
83 return 0;
84 }
85
86 int main()
87 {
88 cout << "test1" << endl;
89 test1();
90 cout << endl;
91 cout << "test2" << endl;
92 test2();
93 cout << endl;
94 }
View Code
test1不變,test2 在導入後再拔出一個詞“編程訓練”,發現ID是正常的。
然後在test2最後調用聯想函數,傳入“編程”,可以正常傳出一切的字符串。
在做這個的時分,一開端想的很復雜,就是拿傳入的詞去樹中查找,找到最後一人字對應的節點,然後以那個節點為根停止遍歷。然後就開開心心腸去寫了,後果寫一局部就要對之前的代碼停止更改,於是,這個接口越來越“肥”了:
Dictionary.h
1 #ifndef __DICTIONARY_H__
2 #define __DICTIONARY_H__
3
4 #include "DictionaryData.h"
5 #include "DictionaryConf.h"
6
7 #include <memory>
8 #include <vector>
9 #include <list>
10
11 namespace ccx{
12
13 using std::shared_ptr;
14 using std::vector;
15 using std::list;
16
17 class Dictionary
18 {
19 typedef unordered_map<string, pDictElem>::iterator WordIt;
20 public:
21 Dictionary();
22 void push(const string & word);
23 void push(vector<string> & words);
24 int search(const string & word);
25 bool associate(const string & word, vector<string> & data);
26 private:
27 void AddWord(const string & word, int wordId);
28 void splitWord(const string & word, vector<string> & characters);//把詞拆成字
29 int search(vector<string> & data, pDictElem & pcur);
30 pDictElem _dictionary;
31 DictionaryConf _conf;
32
33 //遍歷
34 public:
35 string getCurChar();
36 string getCurWord();
37 int getCurWordId();
38 bool isEnd();
39 void resetIt();
40 void next();
41 private:
42 void resetPoint(pDictElem pcur);
43 void next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
44 void nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
45 string getCurWord(list<WordIt> & stackWord);
46
47 pDictElem _pcur;
48 WordIt _itcur;
49
50 //用list完成棧,遍歷時方便
51 list<WordIt> _stackWord;
52 list<pDictElem> _stackDict;
53
54 //導入導出
55 public:
56 void leading_in();
57 void leading_out();
58 };
59
60 }
61
62 #endif
對幾個原有的函數停止了重載,次要是為了可以復用一些代碼,但是又想不到適宜的新的函數名(英語不太好Orz)。
首先,是要可以查找並前往新的根結點,於是對search停止修正:
1 int Dictionary::search(vector<string> & characters, pDictElem & root)
2 {
3 vector<string>::iterator it_char;
4 it_char = characters.begin();
5 root = _dictionary;
6 int i = 0;
7 for(; it_char != characters.end(); ++it_char, ++i)
8 {
9 WordIt it_word;
10 it_word = root->_words.find(*it_char);
11
12 if(it_word == root->_words.end())
13 {
14 break;
15 }else{
16 root = it_word->second;
17 }
18 }
19 return i;
20 }
形參第一項是分解後的字集,第二項是一個智能指針,指向某個節點。這裡前往值改為了字集的第幾項,有兩個目的:
1、拔出函數中可以方便地知道下一個要拔出的是哪個字符
2、聯想函數中可以判別字集中的字能否都存在於詞典中
3、好吧,我沒想到其它好方法,而且事先是想到下面兩點就這麼做了,後來發現,拔出局部的代碼基本就不必改
然後是重載search:
1 int Dictionary::search(const string & word)
2 {
3 pDictElem root = _dictionary;
4 vector<string> temp;
5 splitWord(word, temp);
6
7 int ret = search(temp, root);
8 int size = temp.size();
9 if(ret != size)
10 {
11 return -1;
12 }
13 return root->_wordId;
14 }
在這裡對字停止分解,並定義一個暫時的根結點,這樣做的目的是為了維護private中的根結點,並且可以在多線程環境中互不攪擾。
可以找到“新的根”後,就要對它停止遍歷了。假如只要單一線程或進程來運用它,這裡可以直接把resetPoint(原來的)修正一下,設置指定結點就可以了:
1 void Dictionary::resetPoint(pDictElem pcur)
2 {
3 _pcur = pcur;
4 if(_stackDict.size())
5 {
6 _stackDict.clear();
7 }
8 if(_stackWord.size())
9 {
10 _stackWord.clear();
11 }
12 next();
13 }
假如是這樣,那後面也完全不必修正。由於這個詞典最後是要使用到miniSearchEngin中,於是我對遍歷局部的函數停止了修正:
1 void Dictionary::next()
2 {
3 next(_pcur, _stackWord, _stackDict);
4 }
5
6 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
7 {
8 while(pcur)
9 {
10 nextWord(pcur, stackWord, stackDict);
11 if(!pcur || pcur->_wordId)
12 {
13 break;
14 }
15 }
16 }
17
18 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
19 {
20 if(pcur)
21 {
22 if(pcur->_words.size())
23 {
24 stackDict.push_back(pcur);
25 stackWord.push_back(pcur->_words.begin());
26 pcur = stackWord.back()->second;
27 }else{
28 ++(stackWord.back());
29 }
30 while(stackWord.back() == stackDict.back()->_words.end())
31 {
32 stackDict.pop_back();
33 stackWord.pop_back();
34 if(!stackDict.size())
35 {
36 pcur = NULL;
37 }
38 ++(stackWord.back());
39 }
40 if(pcur)
41 {
42 pcur = stackWord.back()->second;
43 }
44 }
45 }
next局部,改為傳入參數,這樣就可以在associate裡定義暫時的棧和智能指針等,遍歷的時分與其它任務並沒有任何關系。
異樣地,getWord也要做相反的更改:
1 string Dictionary::getCurWord()
2 {
3 return getCurWord(_stackWord);
4 }
5
6 string Dictionary::getCurWord(list<WordIt> & stackWord)
7 {
8 string temp;
9 list<WordIt>::iterator it_word;
10 it_word = stackWord.begin();
11
12 for(; it_word != stackWord.end(); ++it_word)
13 {
14 temp += (*it_word)->first;
15 }
16 return temp;
17 }
當然了,對外提供的接口都是不要傳參的,其它的只能在外部運用,於是放入了private區。
終於可以開端寫聯想了0.0
1 bool Dictionary::associate(const string & word, vector<string> & data)
2 {
3 pDictElem root = _dictionary;
4 vector<string> temp;
5 splitWord(word, temp);
6
7 int ret = search(temp, root);
8 int size = temp.size();
9 if(ret != size)
10 {
11 return false;
12 }
13
14 list<WordIt> stackWord;
15 list<pDictElem> stackDict;
16 next(root, stackWord, stackDict);
17 while(root)
18 {
19 string temp = getCurWord(stackWord);
20 data.push_back(temp);
21 next(root, stackWord, stackDict);
22 }
23
24 if(!data.size())
25 {
26 return false;
27 }
28 return true;
29 }
前往bool類型,可以方便地判別能否聯想成功,即以傳入的詞做為前綴,能否找到剩余局部(詞典裡有存)。於是乎,一個渣渣型號的詞典就做好啦~~~
Dictionary.cc
1 #include "Dictionary.h"
2 #include <iostream>
3 #include <fstream>
4 #include <string>
5 #include <json/json.h>
6
7 namespace ccx{
8
9 using std::endl;
10 using std::cout;
11 using std::pair;
12 using std::ofstream;
13 using std::ifstream;
14
15 Dictionary::Dictionary()
16 : _dictionary(new DictElem)
17 , _conf()
18 {
19 _dictionary->_wordId = 0;
20 _pcur = _dictionary;
21 }
22
23 void Dictionary::splitWord(const string & word, vector<string> & characters)
24 {
25 int num = word.size();
26 int i = 0;
27 while(i < num)
28 {
29 int size = 1;
30 if(word[i] & 0x80)
31 {
32 char temp = word[i];
33 temp <<= 1;
34 do{
35 temp <<= 1;
36 ++size;
37 }while(temp & 0x80);
38 }
39 string subWord;
40 subWord = word.substr(i, size);
41 characters.push_back(subWord);
42 i += size;
43 }
44 }
45
46 void Dictionary::AddWord(const string & word, int wordId)
47 {
48 vector<string> characters;
49 splitWord(word, characters);
50
51 vector<string>::iterator it_char;
52 it_char = characters.begin();
53 pDictElem root;
54 root = _dictionary;
55 for(; it_char != characters.end(); ++it_char)
56 {
57 WordIt it_word;
58 it_word = root->_words.find(*it_char);
59
60 if(it_word == root->_words.end())
61 {
62 pair<string, pDictElem> temp;
63 temp.first = *it_char;
64 pDictElem dictemp(new DictElem);
65 dictemp->_word = *it_char;
66 dictemp->_wordId = 0;
67 temp.second = dictemp;
68 root->_words.insert(temp);
69 root = dictemp;
70 }else{
71 root = it_word->second;
72 }
73 }
74 if(!root->_wordId)
75 {
76 root->_wordId = wordId;
77 }
78 }
79
80 void Dictionary::push(const string & word)
81 {
82 ++(_dictionary->_wordId);
83 AddWord(word, _dictionary->_wordId);
84 }
85
86 void Dictionary::push(vector<string> & words)
87 {
88 int size = words.size();
89 for(int i = 0; i < size; ++i)
90 {
91 push(words[i]);
92 }
93 }
94
95 int Dictionary::search(const string & word)
96 {
97 pDictElem root = _dictionary;
98 vector<string> temp;
99 splitWord(word, temp);
100
101 int ret = search(temp, root);
102 int size = temp.size();
103 if(ret != size)
104 {
105 return -1;
106 }
107 return root->_wordId;
108 }
109
110 int Dictionary::search(vector<string> & characters, pDictElem & root)
111 {
112 vector<string>::iterator it_char;
113 it_char = characters.begin();
114 root = _dictionary;
115 int i = 0;
116 for(; it_char != characters.end(); ++it_char, ++i)
117 {
118 WordIt it_word;
119 it_word = root->_words.find(*it_char);
120
121 if(it_word == root->_words.end())
122 {
123 break;
124 }else{
125 root = it_word->second;
126 }
127 }
128 return i;
129 }
130
131 bool Dictionary::associate(const string & word, vector<string> & data)
132 {
133 pDictElem root = _dictionary;
134 vector<string> temp;
135 splitWord(word, temp);
136
137 int ret = search(temp, root);
138 int size = temp.size();
139 if(ret != size)
140 {
141 return false;
142 }
143
144 list<WordIt> stackWord;
145 list<pDictElem> stackDict;
146 next(root, stackWord, stackDict);
147 while(root)
148 {
149 string temp = getCurWord(stackWord);
150 data.push_back(temp);
151 next(root, stackWord, stackDict);
152 }
153
154 if(!data.size())
155 {
156 return false;
157 }
158 return true;
159 }
160
161 //遍歷用
162
163 void Dictionary::resetPoint(pDictElem pcur)
164 {
165 _pcur = pcur;
166 if(_stackDict.size())
167 {
168 _stackDict.clear();
169 }
170 if(_stackWord.size())
171 {
172 _stackWord.clear();
173 }
174 next();
175 }
176
177 void Dictionary::resetIt()
178 {
179 resetPoint(_dictionary);
180 }
181
182 void Dictionary::next()
183 {
184 next(_pcur, _stackWord, _stackDict);
185 }
186
187 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
188 {
189 while(pcur)
190 {
191 nextWord(pcur, stackWord, stackDict);
192 if(!pcur || pcur->_wordId)
193 {
194 break;
195 }
196 }
197 }
198
199 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
200 {
201 if(pcur)
202 {
203 if(pcur->_words.size())
204 {
205 stackDict.push_back(pcur);
206 stackWord.push_back(pcur->_words.begin());
207 pcur = stackWord.back()->second;
208 }else{
209 ++(stackWord.back());
210 }
211 while(stackWord.back() == stackDict.back()->_words.end())
212 {
213 stackDict.pop_back();
214 stackWord.pop_back();
215 if(!stackDict.size())
216 {
217 pcur = NULL;
218 }
219 ++(stackWord.back());
220 }
221 if(pcur)
222 {
223 pcur = stackWord.back()->second;
224 }
225 }
226 }
227
228 string Dictionary::getCurChar()
229 {
230 return _pcur->_word;
231 }
232
233 int Dictionary::getCurWordId()
234 {
235 return _pcur->_wordId;
236 }
237
238 string Dictionary::getCurWord()
239 {
240 return getCurWord(_stackWord);
241 }
242
243 string Dictionary::getCurWord(list<WordIt> & stackWord)
244 {
245 string temp;
246 list<WordIt>::iterator it_word;
247 it_word = stackWord.begin();
248
249 for(; it_word != stackWord.end(); ++it_word)
250 {
251 temp += (*it_word)->first;
252 }
253 return temp;
254 }
255
256 bool Dictionary::isEnd()
257 {
258 return _pcur == NULL;
259 }
260
261 void Dictionary::leading_in()//導入,失敗沒必要加入順序
262 {
263 ifstream ifs;
264 const char * path = _conf.getDictionaryPath().c_str();
265 ifs.open(path);
266 if(!ifs.good())
267 {
268 cout << "open Dictionary.json error(leading_in)" << endl;
269 }else{
270 Json::Value root;
271 Json::Reader reader;
272
273 if(!reader.parse(ifs, root, false))
274 {
275 cout << "json read Dictionary.json error" << endl;
276 }else{
277 int size = root.size();
278 for(int i = 0; i < size; ++i)
279 {
280 string word = root[i]["Word"].asString();
281 int wordId = root[i]["WordId"].asInt();
282 AddWord(word, wordId);
283 ++(_dictionary->_wordId);
284 }
285 }
286 }
287 }
288
289 void Dictionary::leading_out()
290 {
291 Json::Value root;
292 Json::FastWriter writer;
293
294 resetIt();
295
296 while(!isEnd())
297 {
298 Json::Value elem;
299 elem["Word"] = getCurWord();
300 elem["WordId"] = getCurWordId();
301 root.append(elem);
302 next();
303 }
304
305 string words;
306 words = writer.write(root);
307
308 ofstream ofs;
309 const char * path = _conf.getDictionaryPath().c_str();
310 ofs.open(path);
311 if(!ofs.good())
312 {
313 cout << "open Dictionary.json error(leading_out)" << endl;
314 ofs.open("Dictionary.tmp");
315 if(!ofs.good())
316 {
317 exit(EXIT_FAILURE);
318 }
319 }
320
321 ofs << words;
322 ofs.close();
323 }
324
325 }
View Code