| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442 |
- using System;
- using System.Collections;
- using System.Collections.Generic;
- using System.IO;
- using System.Linq;
- using System.Text;
- using System.Text.RegularExpressions;
- using System.Threading.Tasks;
- namespace CallCenter.Utility
- {
- public class KeywordSpliterHelper
- {
#region Fields
// Separator placed between tokens by the coarse formatting pass and used to split them again.
private static string _SplitChar = " ";

// Words that are removed from the segmentation result (stop words).
private static readonly string[] _StopWordsList = new string[]
{
    "的", "我们", "要", "自己", "之", "将", "“", "”", ",", "(", ")", "后", "应", "到",
    "某", "后", "个", "是", "位", "新", "一", "两", "在", "中", "或", "有", "更", "好"
};
#endregion

// Cache for the entries loaded from keywords_default.dic.
private static SortedList _KeywordsCacheDefault = null;

// Cache for the entries loaded from keywords_baidu.dic, the custom dictionary
// (Baidu keywords or other user-defined keywords).
private static SortedList _KeywordsCacheBaidu = null;
/// <summary>
/// Segments a text into keywords and returns them as one comma-separated string.
/// </summary>
/// <param name="keyText">The text to segment.</param>
/// <returns>
/// The keywords joined with "," (no trailing comma). Custom-dictionary matches come first,
/// followed by the segmented words ordered by how often they occurred. An empty string is
/// returned for null/empty input or when no keyword was found.
/// </returns>
public static string DoGetKeyword(string keyText)
{
    if (String.IsNullOrEmpty(keyText))
    {
        return "";
    }

    // Make sure both dictionaries are cached before segmenting.
    LoadDict();
    SortedList customWords = LoadDictBaidu();

    // Segmentation against the default dictionary, ranked by occurrence count.
    Dictionary<string, int> ranked = SortByDuplicateCount(SplitToList(keyText));
    List<string> keywords = new List<string>(ranked.Keys);

    if (!ranked.ContainsKey(keyText) && customWords.ContainsKey(keyText))
    {
        // The whole input is itself a custom keyword: put it in front.
        keywords.Insert(0, keyText);
    }
    else
    {
        // Otherwise put every custom keyword that occurs inside the input in front.
        foreach (DictionaryEntry entry in customWords)
        {
            string word = entry.Value.ToString();
            if (keyText.IndexOf(word) >= 0 && !ranked.ContainsKey(word))
            {
                keywords.Insert(0, word);
            }
        }
    }

    return string.Join(",", keywords.ToArray());
}
- //
- #region 读取文本
/// <summary>
/// Reads a dictionary file (UTF-8, one entry per line) into a sorted list.
/// </summary>
/// <param name="FilePath">Path of the dictionary file.</param>
/// <returns>
/// A list in which every non-empty line is stored as both key and value. When the file does
/// not exist the list holds a single placeholder entry under the key "0". When reading fails,
/// the failure is traced and the entries read so far are returned.
/// </returns>
private static SortedList LoadDictFile(string FilePath)
{
    SortedList entries = new SortedList();
    try
    {
        if (!File.Exists(FilePath))
        {
            entries.Add("0", "文件" + FilePath + "不存在...");
            return entries;
        }

        // The using block closes the reader even when reading throws.
        using (StreamReader reader = new StreamReader(FilePath, Encoding.UTF8))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (line.Length == 0)
                {
                    continue;
                }

                // The indexer overwrites an existing key. Add() would throw on a repeated
                // line and abort the load at the first duplicate.
                entries[line] = line;
            }
        }
    }
    catch (Exception ex)
    {
        // Loading stays best-effort, but the failure is no longer invisible.
        System.Diagnostics.Trace.TraceError("Failed to load dictionary file '{0}': {1}", FilePath, ex);
    }
    return entries;
}
- #endregion
- #region 载入词典
/// <summary>
/// Returns the default dictionary (keywords_default.dic), loading it into the cache on first use.
/// </summary>
/// <returns>The cached default dictionary.</returns>
private static SortedList LoadDict()
{
    // Resolve the file path only on a cache miss; once the dictionary is cached
    // there is nothing to look up.
    if (_KeywordsCacheDefault == null)
    {
        _KeywordsCacheDefault = LoadDictFile(GetPhysicalFilePath("keywords_default.dic"));
    }
    return _KeywordsCacheDefault;
}
/// <summary>
/// Returns the custom dictionary (keywords_baidu.dic), loading it into the cache on first use.
/// </summary>
/// <returns>The cached custom dictionary.</returns>
private static SortedList LoadDictBaidu()
{
    // Resolve the file path only on a cache miss; once the dictionary is cached
    // there is nothing to look up.
    if (_KeywordsCacheBaidu == null)
    {
        _KeywordsCacheBaidu = LoadDictFile(GetPhysicalFilePath("keywords_baidu.dic"));
    }
    return _KeywordsCacheBaidu;
}
/// <summary>
/// Resolves the physical path of a dictionary file.
/// </summary>
/// <param name="dictFileName">File name, e.g. keywords_baidu.dic.</param>
/// <returns>
/// When an HttpContext is current: "~/bin/" + file name mapped through Server.MapPath.
/// Otherwise: the file name combined with the directory that contains this assembly.
/// </returns>
private static string GetPhysicalFilePath(string dictFileName)
{
    System.Web.HttpContext webContext = System.Web.HttpContext.Current;
    if (webContext == null)
    {
        // No web request (e.g. a desktop application): look next to the assembly.
        string assemblyDir = Path.GetDirectoryName(typeof(KeywordSpliterHelper).Assembly.Location);
        return Path.Combine(assemblyDir, dictFileName);
    }

    // Web server environment: the file is expected in the application's bin folder.
    return webContext.Server.MapPath("~/bin/" + dictFileName);
}
- #endregion
- //
- #region 正则检测
/// <summary>
/// Tests whether a string contains a match for a regular expression.
/// </summary>
/// <param name="str">The text to test.</param>
/// <param name="reg">The regular expression pattern.</param>
/// <returns>true when <paramref name="reg"/> matches somewhere in <paramref name="str"/>.</returns>
private static bool IsMatch(string str, string reg)
{
    // The static overload reuses patterns from the Regex cache instead of
    // constructing a new Regex object on every call.
    return Regex.IsMatch(str, reg);
}
- #endregion
- //
- #region 首先格式化字符串(粗分)
/// <summary>
/// Coarse pre-segmentation: inserts the split character wherever the kind of character changes
/// (ASCII word characters, Chinese characters, symbols) so the text can be split into tokens.
/// </summary>
/// <param name="val">The raw text.</param>
/// <returns>
/// The text with separators inserted; an empty string for null or empty input.
/// Carriage returns and line feeds are dropped without producing a separator.
/// </returns>
private static string FormatStr(string val)
{
    if (string.IsNullOrEmpty(val))
    {
        return "";
    }

    // Kind of the previously emitted character.
    const int Blank = 0;    // nothing emitted yet, or a separator was just written
    const int Ascii = 1;    // ASCII letter / digit / word punctuation
    const int Chinese = 2;  // character in the range U+4E00..U+9FA5
    const int Symbol = 3;   // any other visible character

    string separator = _SplitChar;
    // A StringBuilder keeps the pass linear; repeated string concatenation is quadratic.
    StringBuilder output = new StringBuilder(val.Length * 2);
    int lastKind = Blank;

    foreach (char c in val)
    {
        if (c < 0x81)
        {
            if (c < 33)
            {
                // Whitespace or control character: ends the current token.
                // CR and LF are skipped entirely and do not end it.
                if (lastKind != Blank && c != '\n' && c != '\r')
                {
                    output.Append(" ");
                    lastKind = Blank;
                }
                continue;
            }

            // Characters that may be part of an ASCII token: 0-9 a-z A-Z @ . % # : / & _ -
            bool isTokenChar = (c >= '0' && c <= '9')
                || (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || "@.%#:/&_-".IndexOf(c) >= 0;

            if (!isTokenChar)
            {
                // Any other ASCII symbol stands on its own.
                if (lastKind != Blank)
                {
                    output.Append(separator);
                }
                output.Append(c);
                lastKind = Symbol;
            }
            else if (lastKind == Chinese || lastKind == Symbol)
            {
                // An ASCII token starts right after Chinese text or a symbol.
                output.Append(separator).Append(c);
                lastKind = Ascii;
            }
            else
            {
                output.Append(c);
                // @ % # : are appended to the token but mark its end.
                lastKind = (c == '@' || c == '%' || c == '#' || c == ':') ? Symbol : Ascii;
            }
        }
        else
        {
            // Separate from a preceding ASCII token or symbol.
            if (lastKind != Blank && lastKind != Chinese)
            {
                output.Append(separator);
            }

            if (c >= '\u4e00' && c <= '\u9fa5')
            {
                output.Append(c);
                lastKind = Chinese;
            }
            else
            {
                // Non-ASCII character outside the Chinese range (e.g. full-width punctuation).
                // After an ASCII token or symbol this writes a second separator, exactly as
                // the previous implementation did; the empty token it creates is discarded
                // by the later segmentation step.
                if (lastKind != Blank)
                {
                    output.Append(separator);
                }
                output.Append(c);
                lastKind = Symbol;
            }
        }
    }

    return output.ToString();
}
- #endregion
- //
- #region 分词
/// <summary>
/// Extracts keyword candidates from the tokens produced by the coarse split.
/// </summary>
/// <param name="key">The tokens to examine.</param>
/// <returns>
/// Tokens consisting only of letters, digits, dots and Chinese characters are considered.
/// A non-Chinese token is returned as is (unless it consists only of dots). A pure Chinese
/// token of 2 to 7 characters is returned as is, and in addition every substring of 2 to 10
/// characters that is found in the default dictionary is returned.
/// </returns>
private static ArrayList StringSpliter(string[] key)
{
    ArrayList found = new ArrayList();
    try
    {
        SortedList dictionary = LoadDict();

        foreach (string token in key)
        {
            // Only Chinese, English letters, digits and dots (but not a lone dot).
            if (!IsMatch(token, @"^(?!^\.$)([a-zA-Z0-9\.\u4e00-\u9fa5]+)$"))
            {
                continue;
            }

            if (!IsMatch(token, "^[\u4e00-\u9fa5]+$"))
            {
                // Not pure Chinese: keep the token unless it is made of dots only.
                if (!IsMatch(token, @"^(\.*)$"))
                {
                    found.Add(token);
                }
                continue;
            }

            int tokenLength = token.Length;
            if (tokenLength < 2)
            {
                continue;
            }
            if (tokenLength <= 7)
            {
                found.Add(token);
            }

            // For every start position try the substrings from longest to shortest.
            for (int start = 0; start < tokenLength; start++)
            {
                for (int length = tokenLength - start; length >= 2; length--)
                {
                    if (length > 10)
                    {
                        continue;
                    }

                    string candidate = token.Substring(start, length);
                    if (dictionary.Contains(candidate))
                    {
                        found.Add(candidate);
                    }
                }
            }
        }
    }
    catch (Exception)
    {
        // Best effort: on any failure the candidates collected so far are returned.
    }
    return found;
}
- #endregion
- #region 得到分词结果
/// <summary>
/// Segments a text and removes the stop words from the result.
/// </summary>
/// <param name="keyText">The text to segment.</param>
/// <returns>The keyword candidates, in the order they were found, without stop words.</returns>
private static ArrayList SplitToList(string keyText)
{
    ArrayList KeyList = StringSpliter(FormatStr(keyText).Split(_SplitChar.ToCharArray()));

    // Walk backwards: RemoveAt shifts the following items down, so a forward loop
    // would skip the item right after each removal and leave consecutive stop words in.
    for (int i = KeyList.Count - 1; i >= 0; i--)
    {
        if (IsStopword(KeyList[i].ToString()))
        {
            KeyList.RemoveAt(i);
        }
    }
    return KeyList;
}
/// <summary>
/// Counts how often each item occurs and orders the distinct items by that count.
/// </summary>
/// <param name="inputList">The items to count; each item is compared by its ToString() text.</param>
/// <returns>
/// A dictionary mapping each distinct text to its number of occurrences, as ordered by
/// GetSortByValueDict.
/// </returns>
private static Dictionary<string, int> SortByDuplicateCount(ArrayList inputList)
{
    Dictionary<string, int> counts = new Dictionary<string, int>();
    foreach (object item in inputList)
    {
        string word = item.ToString();

        // TryGetValue leaves 'seen' at 0 for a new word, so a single lookup
        // serves both the first occurrence and every repeat.
        int seen;
        counts.TryGetValue(word, out seen);
        counts[word] = seen + 1;
    }
    return GetSortByValueDict(counts);
}
/// <summary>
/// Builds a new dictionary whose entries are added in descending order of their values.
/// </summary>
/// <typeparam name="K">Key type.</typeparam>
/// <typeparam name="V">Value type; ordered with its default comparer.</typeparam>
/// <param name="distinctDict">The dictionary to order.</param>
/// <returns>
/// A new dictionary filled from the largest value to the smallest. Entries with equal values
/// keep the relative order in which <paramref name="distinctDict"/> enumerates them.
/// </returns>
private static Dictionary<K, V> GetSortByValueDict<K, V>(IDictionary<K, V> distinctDict)
{
    Dictionary<K, V> sortByValueDict = new Dictionary<K, V>(distinctDict.Count);

    // OrderByDescending is a stable sort, so ties stay in source order. One sort replaces
    // the former "scan the whole dictionary once per value" loop.
    foreach (KeyValuePair<K, V> pair in distinctDict.OrderByDescending(p => p.Value))
    {
        sortByValueDict.Add(pair.Key, pair.Value);
    }
    return sortByValueDict;
}
- #endregion
/// <summary>
/// Tells whether a word is in the stop-word list.
/// </summary>
/// <param name="str">The word to look up.</param>
/// <returns>true when the stop-word list contains exactly this word.</returns>
private static bool IsStopword(string str)
{
    return Array.IndexOf(_StopWordsList, str) >= 0;
}
/// <summary>
/// Checks whether a string consists of digits only and, when possible, parses it.
/// </summary>
/// <param name="message">The text to check; null is treated as not numeric.</param>
/// <param name="result">
/// The parsed value, or -1 when the text is not numeric or cannot be represented
/// as an Int32 (for example because it has too many digits).
/// </param>
/// <returns>true when the text matches ^\d+$, otherwise false.</returns>
private static bool isNumberic(string message, out int result)
{
    result = -1;
    if (message == null || !Regex.IsMatch(message, @"^\d+$"))
    {
        return false;
    }

    // TryParse instead of Parse: a digit string that does not fit an Int32 (or that uses
    // non-ASCII digits accepted by \d) is still numeric and must not throw.
    int parsed;
    if (int.TryParse(message, System.Globalization.NumberStyles.Integer,
                     System.Globalization.CultureInfo.InvariantCulture, out parsed))
    {
        result = parsed;
    }
    return true;
}
/// <summary>
/// Tells whether a key is too short to be worth processing.
/// </summary>
/// <param name="key">The text to check.</param>
/// <returns>
/// true when the key, encoded with Encoding.Default, is shorter than 5 bytes for a
/// digits-only key or shorter than 4 bytes for any other key; otherwise false.
/// NOTE(review): the length is counted in bytes of Encoding.Default, so the number of
/// characters that passes depends on that encoding - confirm this is intended.
/// </returns>
private static bool JudgementStr(string key)
{
    int encodedLength = Encoding.Default.GetBytes(key).Length;

    // Only the yes/no answer of the numeric check is needed here.
    int ignored;
    int minimumLength = isNumberic(key, out ignored) ? 5 : 4;

    return encodedLength < minimumLength;
}
- }
- }
|