IM12345_Api demo - 代码源于 商丘市12345项目

KeywordSpliterHelper.cs 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442
  1. using System;
  2. using System.Collections;
  3. using System.Collections.Generic;
  4. using System.IO;
  5. using System.Linq;
  6. using System.Text;
  7. using System.Text.RegularExpressions;
  8. using System.Threading.Tasks;
  9. namespace CallCenter.Utility
  10. {
  11. public class KeywordSpliterHelper
  12. {
  #region Fields
  // Separator that FormatStr inserts between coarse tokens and that SplitToList splits on.
  // It is never reassigned anywhere in the class, so it is declared as a constant.
  private const string _SplitChar = " ";
  // Words and punctuation that IsStopword reports as noise to be dropped from the result.
  private static readonly string[] _StopWordsList = new string[] { "的", "我们", "要", "自己", "之", "将", "“", "”", ",", "(", ")", "后", "应", "到", "某", "个", "是", "位", "新", "一", "两", "在", "中", "或", "有", "更", "好" };
  #endregion
  // Cached contents of keywords_default.dic; stays null until LoadDict has run once.
  private static SortedList _KeywordsCacheDefault = null;
  // Cached contents of keywords_baidu.dic (custom / Baidu keywords); stays null until
  // LoadDictBaidu has run once.
  private static SortedList _KeywordsCacheBaidu = null;
  /// <summary>
  /// Segments <paramref name="keyText"/> into keywords and returns them joined by commas.
  /// </summary>
  /// <remarks>
  /// Entries found through the custom dictionary (keywords_baidu.dic) are placed first,
  /// followed by the words produced by the default segmentation in the order returned by
  /// SortByDuplicateCount.
  /// </remarks>
  /// <param name="keyText">Text to extract keywords from.</param>
  /// <returns>
  /// Comma-separated keywords without a trailing comma, or an empty string when
  /// <paramref name="keyText"/> is null or empty or no keyword was found.
  /// </returns>
  public static string DoGetKeyword(string keyText)
  {
      if (String.IsNullOrEmpty(keyText)) return "";

      LoadDict();
      SortedList customDict = LoadDictBaidu();

      // Default-dictionary segmentation, de-duplicated and ordered by occurrence count.
      ArrayList segments = SplitToList(keyText);
      Dictionary<string, int> distinctDict = SortByDuplicateCount(segments);

      // Custom-dictionary hits that the default segmentation did not already produce.
      List<string> customHits = new List<string>();
      if (!distinctDict.ContainsKey(keyText) && customDict.ContainsKey(keyText))
      {
          // The whole input is itself a custom keyword.
          customHits.Add(keyText);
      }
      else
      {
          foreach (DictionaryEntry entry in customDict)
          {
              string word = entry.Value.ToString();
              // Ordinal search: dictionary words must match the text character for character,
              // independent of the server's current culture.
              if (keyText.IndexOf(word, StringComparison.Ordinal) >= 0 && !distinctDict.ContainsKey(word))
                  customHits.Add(word);
          }
          // Each hit used to be inserted at the very front of the output, so the entry
          // enumerated last ends up first; reversing keeps that established order.
          customHits.Reverse();
      }

      List<string> keywords = new List<string>(customHits.Count + distinctDict.Count);
      keywords.AddRange(customHits);
      keywords.AddRange(distinctDict.Keys);
      return string.Join(",", keywords.ToArray());
  }
  64. //
  65. #region 读取文本
  /// <summary>
  /// Reads a UTF-8 dictionary file, one entry per line, into a <see cref="SortedList"/>
  /// in which every non-empty line is stored as both key and value.
  /// </summary>
  /// <remarks>
  /// Loading is best-effort and never throws: when the file does not exist the list holds a
  /// single entry with key "0" and a "file not found" message as its value; when reading
  /// fails part-way the entries read so far are returned and the error is written to
  /// <see cref="System.Diagnostics.Trace"/>. Repeated lines are stored once.
  /// </remarks>
  /// <param name="FilePath">Full path of the dictionary file.</param>
  /// <returns>The loaded entries; never null.</returns>
  private static SortedList LoadDictFile(string FilePath)
  {
      SortedList entries = new SortedList();
      try
      {
          if (!File.Exists(FilePath))
          {
              entries.Add("0", "文件" + FilePath + "不存在...");
              return entries;
          }

          // "using" guarantees the reader is released even if a read fails.
          using (StreamReader reader = new StreamReader(FilePath, Encoding.UTF8))
          {
              string line;
              while ((line = reader.ReadLine()) != null)
              {
                  if (line.Length == 0)
                      continue;
                  // Indexer assignment instead of Add: a repeated line must not throw and
                  // abort the load of everything that follows it.
                  entries[line] = line;
              }
          }
      }
      catch (Exception ex)
      {
          // Keep the best-effort contract, but leave a trace instead of hiding the failure.
          System.Diagnostics.Trace.TraceError("KeywordSpliterHelper: failed to load dictionary '" + FilePath + "': " + ex);
      }
      return entries;
  }
  96. #endregion
  97. #region 载入词典
  /// <summary>
  /// Returns the default dictionary (keywords_default.dic), loading it from disk on the
  /// first call and serving it from the static cache afterwards.
  /// </summary>
  /// <returns>The cached default dictionary.</returns>
  private static SortedList LoadDict()
  {
      // Resolve the file path only when a load is actually needed; once the cache is
      // filled, later calls return it without touching the path logic again.
      if (_KeywordsCacheDefault == null)
      {
          _KeywordsCacheDefault = LoadDictFile(GetPhysicalFilePath("keywords_default.dic"));
      }
      return _KeywordsCacheDefault;
  }
  /// <summary>
  /// Returns the custom dictionary (keywords_baidu.dic), loading it from disk on the
  /// first call and serving it from the static cache afterwards.
  /// </summary>
  /// <returns>The cached custom dictionary.</returns>
  private static SortedList LoadDictBaidu()
  {
      // Resolve the file path only when a load is actually needed; once the cache is
      // filled, later calls return it without touching the path logic again.
      if (_KeywordsCacheBaidu == null)
      {
          _KeywordsCacheBaidu = LoadDictFile(GetPhysicalFilePath("keywords_baidu.dic"));
      }
      return _KeywordsCacheBaidu;
  }
  /// <summary>
  /// Builds the full path of a dictionary file.
  /// </summary>
  /// <param name="dictFileName">File name only, e.g. keywords_baidu.dic.</param>
  /// <returns>
  /// The path mapped from "~/bin/" when an HTTP context is available; otherwise the path
  /// next to the assembly that contains this class.
  /// </returns>
  private static string GetPhysicalFilePath(string dictFileName)
  {
      System.Web.HttpContext webContext = System.Web.HttpContext.Current;

      // No HTTP context (e.g. a WinForms host): look beside the assembly.
      if (webContext == null)
      {
          string assemblyDir = Path.GetDirectoryName(typeof(KeywordSpliterHelper).Assembly.Location);
          return Path.Combine(assemblyDir, dictFileName);
      }

      // Web host: the dictionaries live in the application's bin folder.
      return webContext.Server.MapPath("~/bin/" + dictFileName);
  }
  135. #endregion
  136. //
  137. #region 正则检测
  /// <summary>
  /// Tests whether <paramref name="str"/> contains a match for the regular expression
  /// <paramref name="reg"/>.
  /// </summary>
  /// <param name="str">Input text.</param>
  /// <param name="reg">Regular expression pattern.</param>
  /// <returns>True when the pattern matches somewhere in the input.</returns>
  private static bool IsMatch(string str, string reg)
  {
      // The static overload reuses parsed patterns from the framework's regex cache, so the
      // handful of fixed patterns used by this class are not rebuilt on every call.
      return Regex.IsMatch(str, reg);
  }
  142. #endregion
  143. //
  144. #region 首先格式化字符串(粗分)
  /// <summary>
  /// Coarse pre-segmentation: walks the text character by character and inserts the
  /// separator wherever the character class changes, so the result can be split into
  /// runs of ASCII word characters, runs of Chinese characters and individual symbols.
  /// </summary>
  /// <param name="val">Raw input text.</param>
  /// <returns>The text with separators inserted; an empty string for null or empty input.</returns>
  private static string FormatStr(string val)
  {
      if (string.IsNullOrEmpty(val))
          return "";

      // Class of the previously emitted character.
      const int Blank = 0, Ascii = 1, Chinese = 2, Symbol = 3;

      string sep = _SplitChar;
      // StringBuilder avoids re-copying the whole result for every appended character.
      StringBuilder result = new StringBuilder(val.Length * 2);
      int prev = Blank;

      foreach (char c in val)
      {
          if (c < 0x81)
          {
              if (c < 33)
              {
                  // Whitespace / control character: emit a single space after real content.
                  // CR and LF are dropped without resetting the state.
                  if (prev != Blank && c != '\n' && c != '\r')
                  {
                      result.Append(' ');
                      prev = Blank;
                  }
              }
              else if (!IsAsciiWordChar(c))
              {
                  // ASCII punctuation outside the word set is a stand-alone symbol.
                  if (prev != Blank)
                      result.Append(sep);
                  result.Append(c);
                  prev = Symbol;
              }
              else if (prev == Chinese || prev == Symbol)
              {
                  // A word character right after Chinese text or a symbol starts a new token.
                  result.Append(sep).Append(c);
                  prev = Ascii;
              }
              else
              {
                  result.Append(c);
                  // '@', '%', '#' and ':' join the current token but make whatever follows
                  // start a new one.
                  prev = (c == '@' || c == '%' || c == '#' || c == ':') ? Symbol : Ascii;
              }
          }
          else
          {
              // Leaving an ASCII or symbol run: close it with a separator.
              if (prev != Blank && prev != Chinese)
                  result.Append(sep);

              if (c >= '\u4e00' && c <= '\u9fa5')
              {
                  result.Append(c);
                  prev = Chinese;
              }
              else
              {
                  // Non-ASCII character outside the CJK range (e.g. full-width punctuation).
                  if (prev != Blank)
                      result.Append(sep);
                  result.Append(c);
                  prev = Symbol;
              }
          }
      }

      return result.ToString();
  }

  /// <summary>
  /// True for the ASCII characters kept inside a word token: digits, letters and
  /// the characters @ . % # : / &amp; _ -
  /// </summary>
  private static bool IsAsciiWordChar(char c)
  {
      return (c >= '0' && c <= '9')
          || (c >= 'a' && c <= 'z')
          || (c >= 'A' && c <= 'Z')
          || "@.%#:/&_-".IndexOf(c) >= 0;
  }
  231. #endregion
  232. //
  233. #region 分词
  /// <summary>
  /// Fine segmentation: turns the coarse tokens into the list of candidate keywords.
  /// </summary>
  /// <remarks>
  /// Tokens made only of letters, digits, dots and Chinese characters are considered.
  /// A mixed or non-Chinese token is added as is (unless it consists only of dots).
  /// A pure-Chinese token of 2 to 7 characters is added whole, and every substring of
  /// 2 to 10 characters that is a key of the default dictionary is added as well.
  /// </remarks>
  /// <param name="key">Coarse tokens.</param>
  /// <returns>Candidate keywords, duplicates included; empty when <paramref name="key"/> is null.</returns>
  private static ArrayList StringSpliter(string[] key)
  {
      const int MaxWholeTokenLength = 7;  // longer Chinese runs are not added as one keyword
      const int MaxDictWordLength = 10;   // longest dictionary word that is looked up
      const int MinWordLength = 2;

      ArrayList words = new ArrayList();
      if (key == null)
          return words;

      try
      {
          SortedList dict = LoadDict();

          foreach (string token in key)
          {
              if (string.IsNullOrEmpty(token))
                  continue;

              // Only Chinese characters, letters, digits and dots (but not a lone dot).
              if (!IsMatch(token, @"^(?!^\.$)([a-zA-Z0-9\.\u4e00-\u9fa5]+)$"))
                  continue;

              if (!IsMatch(token, "^[\u4e00-\u9fa5]+$"))
              {
                  // Not pure Chinese: keep it unless it is made of dots only.
                  if (!IsMatch(token, @"^(\.*)$"))
                      words.Add(token);
                  continue;
              }

              int tokenLen = token.Length;
              if (tokenLen < MinWordLength)
                  continue;
              if (tokenLen <= MaxWholeTokenLength)
                  words.Add(token);

              // For every start position try the longest candidate first. Candidates are
              // capped at MaxDictWordLength up front, so long runs of text no longer
              // allocate substrings that would be thrown away immediately.
              for (int start = 0; start < tokenLen; start++)
              {
                  int longest = Math.Min(MaxDictWordLength, tokenLen - start);
                  for (int len = longest; len >= MinWordLength; len--)
                  {
                      string candidate = token.Substring(start, len);
                      if (dict.Contains(candidate))
                          words.Add(candidate);
                  }
              }
          }
      }
      catch (Exception ex)
      {
          // Segmentation stays best-effort (whatever was collected is returned), but the
          // failure is traced instead of being silently discarded.
          System.Diagnostics.Trace.TraceError("KeywordSpliterHelper: segmentation failed: " + ex);
      }
      return words;
  }
  288. #endregion
  289. #region 得到分词结果
  /// <summary>
  /// Segments the text and removes stop words from the result.
  /// </summary>
  /// <param name="keyText">Text to segment.</param>
  /// <returns>Candidate keywords (duplicates included) without stop words.</returns>
  private static ArrayList SplitToList(string keyText)
  {
      ArrayList KeyList = StringSpliter(FormatStr(keyText).Split(_SplitChar.ToCharArray()));

      // Walk backwards: removing an item shifts the following ones down by one, so a
      // forward index loop would skip the element right after every removed stop word
      // and leave consecutive stop words in the list.
      for (int i = KeyList.Count - 1; i >= 0; i--)
      {
          if (IsStopword(KeyList[i].ToString()))
          {
              KeyList.RemoveAt(i);
          }
      }
      return KeyList;
  }
  /// <summary>
  /// Counts how often each element occurs and returns the distinct elements ordered
  /// from most to least frequent.
  /// </summary>
  /// <param name="inputList">Elements to count; each is converted with ToString().</param>
  /// <returns>Distinct element text mapped to its occurrence count.</returns>
  private static Dictionary<string, int> SortByDuplicateCount(ArrayList inputList)
  {
      // word -> number of occurrences
      Dictionary<string, int> occurrences = new Dictionary<string, int>();
      foreach (object item in inputList)
      {
          string word = item.ToString();
          int seen;
          // TryGetValue leaves "seen" at 0 for a word that has not been counted yet.
          occurrences.TryGetValue(word, out seen);
          occurrences[word] = seen + 1;
      }
      return GetSortByValueDict(occurrences);
  }
  /// <summary>
  /// Copies a dictionary into a new one whose entries are added in descending order of value.
  /// </summary>
  /// <remarks>
  /// Entries with equal values keep the relative order in which the source enumerates them.
  /// Callers rely on the result enumerating in insertion order.
  /// NOTE(review): Dictionary does not document that order as a guarantee; confirm this is
  /// acceptable or switch callers to an ordered collection.
  /// </remarks>
  /// <typeparam name="K">Key type.</typeparam>
  /// <typeparam name="V">Value type; must be comparable through the default comparer.</typeparam>
  /// <param name="distinctDict">Source dictionary; it is not modified.</param>
  /// <returns>A new dictionary filled in descending value order.</returns>
  private static Dictionary<K, V> GetSortByValueDict<K, V>(IDictionary<K, V> distinctDict)
  {
      Dictionary<K, V> sortByValueDict = new Dictionary<K, V>(distinctDict.Count);

      // OrderByDescending is a stable sort, so one O(n log n) pass yields the same order
      // as rescanning the whole dictionary once per sorted value.
      foreach (KeyValuePair<K, V> pair in distinctDict.OrderByDescending(p => p.Value))
      {
          sortByValueDict.Add(pair.Key, pair.Value);
      }
      return sortByValueDict;
  }
  355. #endregion
  /// <summary>
  /// Tells whether <paramref name="str"/> is one of the configured stop words.
  /// </summary>
  /// <param name="str">Word to check.</param>
  /// <returns>True when the word appears in the stop-word list.</returns>
  private static bool IsStopword(string str)
  {
      int position = Array.IndexOf(_StopWordsList, str);
      return position >= 0;
  }
  /// <summary>
  /// Checks whether a string consists solely of the ASCII digits 0-9 and, if so,
  /// converts it to an integer.
  /// </summary>
  /// <param name="message">Text to check; null or empty is not a digit string.</param>
  /// <param name="result">
  /// The parsed value, or -1 when the text is not a digit string or its value does not
  /// fit into an <see cref="int"/>.
  /// </param>
  /// <returns>True when the text is a non-empty string of ASCII digits.</returns>
  private static bool isNumberic(string message, out int result)
  {
      result = -1;
      if (string.IsNullOrEmpty(message))
          return false;

      // Explicit ASCII check: the regex class \d also accepts other Unicode digits,
      // which the integer parser cannot convert.
      foreach (char c in message)
      {
          if (c < '0' || c > '9')
              return false;
      }

      // TryParse instead of Parse: a very long digit string is still a digit string,
      // it just has no int value, so result keeps its -1 sentinel in that case.
      int parsed;
      if (int.TryParse(message, System.Globalization.NumberStyles.None, System.Globalization.CultureInfo.InvariantCulture, out parsed))
          result = parsed;
      return true;
  }
  /// <summary>
  /// Length filter for a keyword: reports whether the text is too short.
  /// 1. A pure digit string needs at least 5 digits.
  /// 2. Any other text needs a width of at least 4, where an ASCII character counts 1
  ///    and every other character counts 2 (i.e. at least two Chinese characters).
  /// </summary>
  /// <remarks>
  /// The width is computed from the characters themselves rather than from
  /// Encoding.Default, whose byte counts depend on the code page of the machine the
  /// code runs on.
  /// NOTE(review): the 1/2 weighting mirrors a GBK-style double-byte encoding, which the
  /// "2 characters" rule above appears to assume — confirm against the deployment locale.
  /// </remarks>
  /// <param name="key">Text to check.</param>
  /// <returns>True when the text is null, empty or shorter than the limits above.</returns>
  private static bool JudgementStr(string key)
  {
      if (string.IsNullOrEmpty(key))
          return true;

      int width = 0;
      foreach (char c in key)
      {
          width += c > 0x7F ? 2 : 1;
      }

      int result;
      int minimumWidth = isNumberic(key, out result) ? 5 : 4;
      return width < minimumWidth;
  }
  405. }
  406. }