using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace CallCenter.Utility
{
    /// <summary>
    /// Extracts keywords from free text. The text is first coarsely split into
    /// runs of Latin characters, Chinese characters and symbols; Chinese runs are
    /// then matched against a default dictionary file, and finally phrases from a
    /// custom dictionary file that occur in the text are put in front.
    /// </summary>
    public class KeywordSpliterHelper
    {
        // Character categories tracked by FormatStr while it scans the input.
        private const int CharTypeBlank = 0;
        private const int CharTypeLatin = 1;
        private const int CharTypeChinese = 2;
        private const int CharTypeSymbol = 3;

        // Length limits applied to Chinese tokens in StringSpliter.
        private const int MinWordLength = 2;
        private const int MaxWholeTokenLength = 7;
        private const int MaxDictWordLength = 10;

        // Separator inserted between tokens by FormatStr and used to split them again.
        private static readonly string _SplitChar = " ";

        // Words that are removed from the segmentation result.
        private static readonly string[] _StopWordsList = new string[] { "的", "我们", "要", "自己", "之", "将", "“", "”", ",", "(", ")", "后", "应", "到", "某", "后", "个", "是", "位", "新", "一", "两", "在", "中", "或", "有", "更", "好" };

        // Hash-based view of _StopWordsList for constant-time membership tests.
        private static readonly HashSet<string> _StopWords = new HashSet<string>(_StopWordsList);

        // Token consisting only of Latin letters, digits, dots and Chinese characters (but not a single ".").
        private static readonly Regex _WordTokenRegex = new Regex(@"^(?!^\.$)([a-zA-Z0-9\.\u4e00-\u9fa5]+)$", RegexOptions.Compiled);

        // Token consisting only of Chinese characters.
        private static readonly Regex _PureChineseRegex = new Regex("^[\u4e00-\u9fa5]+$", RegexOptions.Compiled);

        // Token consisting only of dots (or empty).
        private static readonly Regex _OnlyDotsRegex = new Regex(@"^(\.*)$", RegexOptions.Compiled);

        // String consisting only of digits.
        private static readonly Regex _DigitsOnlyRegex = new Regex(@"^\d+$", RegexOptions.Compiled);

        // Cached content of keywords_default.dic.
        private static SortedList _KeywordsCacheDefault = null;

        // Cached content of keywords_baidu.dic (custom keywords, e.g. Baidu keywords).
        private static SortedList _KeywordsCacheBaidu = null;

        /// <summary>
        /// Returns the keywords found in <paramref name="keyText"/>, joined by commas.
        /// </summary>
        /// <param name="keyText">Text to analyse; may be null or empty.</param>
        /// <returns>
        /// Custom-dictionary phrases first, followed by the segmented words ordered by
        /// how often they occur; an empty string when nothing was found.
        /// </returns>
        public static string DoGetKeyword(string keyText)
        {
            if (String.IsNullOrEmpty(keyText))
            {
                return "";
            }

            LoadDict();
            SortedList customWords = LoadDictBaidu();

            // Segmentation against the default dictionary, most frequent word first.
            Dictionary<string, int> rankedWords = SortByDuplicateCount(SplitToList(keyText));
            List<string> keywords = new List<string>(rankedWords.Keys);

            if (!rankedWords.ContainsKey(keyText) && customWords.ContainsKey(keyText))
            {
                // The whole input is a custom keyword: put it in front (better for SEO).
                keywords.Insert(0, keyText);
            }
            else
            {
                // Otherwise put every custom keyword that occurs inside the text in front.
                foreach (DictionaryEntry entry in customWords)
                {
                    string value = entry.Value.ToString();
                    if (keyText.IndexOf(value, StringComparison.Ordinal) >= 0 && !rankedWords.ContainsKey(value))
                    {
                        keywords.Insert(0, value);
                    }
                }
            }

            return string.Join(",", keywords.ToArray());
        }

        /// <summary>
        /// Reads a UTF-8 dictionary file with one entry per line.
        /// </summary>
        /// <param name="FilePath">Full path of the dictionary file.</param>
        /// <returns>
        /// The non-empty lines of the file, keyed by themselves. The list is empty when
        /// the file does not exist, and holds the lines read so far when reading fails.
        /// </returns>
        private static SortedList LoadDictFile(string FilePath)
        {
            SortedList words = new SortedList();

            // A missing dictionary yields an empty list; an error text must never be stored as a keyword.
            if (!File.Exists(FilePath))
            {
                return words;
            }

            try
            {
                using (StreamReader reader = new StreamReader(FilePath, Encoding.UTF8))
                {
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        if (line.Length == 0)
                        {
                            continue;
                        }

                        // The indexer tolerates duplicate lines; Add would throw and abort the load.
                        words[line] = line;
                    }
                }
            }
            catch (IOException ex)
            {
                // Best effort: keep what was read and report the problem instead of hiding it.
                System.Diagnostics.Trace.TraceWarning("Could not read dictionary file {0}: {1}", FilePath, ex.Message);
            }
            catch (UnauthorizedAccessException ex)
            {
                System.Diagnostics.Trace.TraceWarning("Could not read dictionary file {0}: {1}", FilePath, ex.Message);
            }

            return words;
        }

        /// <summary>
        /// Returns the default dictionary, loading keywords_default.dic on first use.
        /// </summary>
        private static SortedList LoadDict()
        {
            if (_KeywordsCacheDefault == null)
            {
                _KeywordsCacheDefault = LoadDictFile(GetPhysicalFilePath("keywords_default.dic"));
            }

            return _KeywordsCacheDefault;
        }

        /// <summary>
        /// Returns the custom dictionary, loading keywords_baidu.dic on first use.
        /// </summary>
        private static SortedList LoadDictBaidu()
        {
            if (_KeywordsCacheBaidu == null)
            {
                _KeywordsCacheBaidu = LoadDictFile(GetPhysicalFilePath("keywords_baidu.dic"));
            }

            return _KeywordsCacheBaidu;
        }

        /// <summary>
        /// Resolves the physical path of a dictionary file.
        /// </summary>
        /// <param name="dictFileName">File name, e.g. keywords_baidu.dic.</param>
        /// <returns>
        /// The file below "~/bin/" when an HTTP context is available, otherwise the file
        /// next to the assembly that contains this class.
        /// </returns>
        private static string GetPhysicalFilePath(string dictFileName)
        {
            // Web server environment.
            if (System.Web.HttpContext.Current != null)
            {
                return System.Web.HttpContext.Current.Server.MapPath("~/bin/" + dictFileName);
            }

            // Any other environment, e.g. WinForms.
            string dir = Path.GetDirectoryName(typeof(KeywordSpliterHelper).Assembly.Location);
            return Path.Combine(dir, dictFileName);
        }

        /// <summary>
        /// Tests whether <paramref name="str"/> matches the regular expression <paramref name="reg"/>.
        /// </summary>
        private static bool IsMatch(string str, string reg)
        {
            // The static overload reuses the framework's internal pattern cache.
            return Regex.IsMatch(str, reg);
        }

        /// <summary>
        /// Tests whether an ASCII character belongs to the set kept inside Latin tokens:
        /// digits, letters and the characters @ . % # : / &amp; _ -
        /// </summary>
        private static bool IsWordAsciiChar(char c)
        {
            if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
            {
                return true;
            }

            switch (c)
            {
                case '@':
                case '.':
                case '%':
                case '#':
                case ':':
                case '/':
                case '&':
                case '_':
                case '-':
                    return true;
                default:
                    return false;
            }
        }

        /// <summary>
        /// Tests whether a character lies in the range U+4E00..U+9FA5.
        /// </summary>
        private static bool IsChineseChar(char c)
        {
            return c >= '\u4e00' && c <= '\u9fa5';
        }

        /// <summary>
        /// Coarse split: rewrites the text so that runs of Latin characters, runs of
        /// Chinese characters and individual symbols are separated by the split character.
        /// </summary>
        /// <param name="val">Raw text; may be null or empty.</param>
        /// <returns>The separated text, or an empty string for null/empty input.</returns>
        private static string FormatStr(string val)
        {
            if (string.IsNullOrEmpty(val))
            {
                return "";
            }

            string separator = _SplitChar;
            StringBuilder result = new StringBuilder(val.Length * 2);
            int charType = CharTypeBlank;

            foreach (char c in val)
            {
                if (c < 0x81)
                {
                    if (c < 33)
                    {
                        // Blank or control character: ends the current token. Line breaks are
                        // dropped without ending it, so text wrapped across lines stays joined.
                        if (charType != CharTypeBlank && c != '\n' && c != '\r')
                        {
                            result.Append(' ');
                            charType = CharTypeBlank;
                        }

                        continue;
                    }

                    if (!IsWordAsciiChar(c))
                    {
                        // ASCII symbol outside the word set: always a token of its own.
                        if (charType != CharTypeBlank)
                        {
                            result.Append(separator);
                        }

                        result.Append(c);
                        charType = CharTypeSymbol;
                    }
                    else if (charType == CharTypeChinese || charType == CharTypeSymbol)
                    {
                        // A Latin run starts right after Chinese text or a symbol.
                        result.Append(separator).Append(c);
                        charType = CharTypeLatin;
                    }
                    else
                    {
                        result.Append(c);
                        charType = (c == '@' || c == '%' || c == '#' || c == ':') ? CharTypeSymbol : CharTypeLatin;
                    }
                }
                else
                {
                    // Separate from a preceding Latin run or symbol.
                    if (charType != CharTypeBlank && charType != CharTypeChinese)
                    {
                        result.Append(separator);
                    }

                    if (IsChineseChar(c))
                    {
                        result.Append(c);
                        charType = CharTypeChinese;
                    }
                    else
                    {
                        // Non-ASCII character outside the Chinese range, e.g. full-width punctuation.
                        if (charType != CharTypeBlank)
                        {
                            result.Append(separator);
                        }

                        result.Append(c);
                        charType = CharTypeSymbol;
                    }
                }
            }

            return result.ToString();
        }

        /// <summary>
        /// Fine split: turns the coarse tokens into candidate keywords.
        /// </summary>
        /// <param name="key">Tokens produced by splitting the output of FormatStr.</param>
        /// <returns>
        /// Latin/numeric/mixed tokens as they are; for pure Chinese tokens the token itself
        /// (2 to 7 characters) plus every substring of 2 to 10 characters that is in the
        /// default dictionary. Duplicates are kept so that they can be counted later.
        /// </returns>
        private static List<string> StringSpliter(string[] key)
        {
            List<string> words = new List<string>();
            if (key == null)
            {
                return words;
            }

            SortedList dict = LoadDict();

            foreach (string token in key)
            {
                // Only Chinese, Latin letters, digits and dots qualify.
                if (string.IsNullOrEmpty(token) || !_WordTokenRegex.IsMatch(token))
                {
                    continue;
                }

                if (!_PureChineseRegex.IsMatch(token))
                {
                    // Skip tokens made of dots only.
                    if (!_OnlyDotsRegex.IsMatch(token))
                    {
                        words.Add(token);
                    }

                    continue;
                }

                int tokenLength = token.Length;
                if (tokenLength < MinWordLength)
                {
                    continue;
                }

                if (tokenLength <= MaxWholeTokenLength)
                {
                    words.Add(token);
                }

                // For every start position try the longest candidate first, then shrink from the right.
                for (int start = 0; start < tokenLength; start++)
                {
                    int longest = Math.Min(tokenLength - start, MaxDictWordLength);
                    for (int length = longest; length >= MinWordLength; length--)
                    {
                        string candidate = token.Substring(start, length);
                        if (dict.Contains(candidate))
                        {
                            words.Add(candidate);
                        }
                    }
                }
            }

            return words;
        }

        /// <summary>
        /// Segments the text and removes stop words.
        /// </summary>
        /// <param name="keyText">Raw text.</param>
        /// <returns>Candidate keywords, duplicates included.</returns>
        private static List<string> SplitToList(string keyText)
        {
            List<string> words = StringSpliter(FormatStr(keyText).Split(_SplitChar.ToCharArray()));

            // RemoveAll visits every element; removing by index while counting upwards
            // would skip the element that follows each removed one.
            words.RemoveAll(IsStopword);
            return words;
        }

        /// <summary>
        /// Counts how often each word occurs and orders the words by that count.
        /// </summary>
        /// <param name="inputList">Words, duplicates included.</param>
        /// <returns>Word to occurrence count, most frequent first.</returns>
        private static Dictionary<string, int> SortByDuplicateCount(List<string> inputList)
        {
            Dictionary<string, int> counts = new Dictionary<string, int>();
            foreach (string word in inputList)
            {
                int seen;
                counts.TryGetValue(word, out seen);
                counts[word] = seen + 1;
            }

            return GetSortByValueDict(counts);
        }

        /// <summary>
        /// Copies a dictionary into a new one whose entries are added in descending value order.
        /// </summary>
        /// <typeparam name="K">Key type.</typeparam>
        /// <typeparam name="V">Value type; ordered with its default comparer.</typeparam>
        /// <param name="distinctDict">Source dictionary.</param>
        /// <returns>
        /// A dictionary filled from the largest value to the smallest; entries with equal
        /// values keep the order in which the source enumerates them.
        /// </returns>
        private static Dictionary<K, V> GetSortByValueDict<K, V>(IDictionary<K, V> distinctDict)
        {
            Dictionary<K, V> sortByValueDict = new Dictionary<K, V>(distinctDict.Count);

            // OrderByDescending is a stable sort, so ties stay in source order.
            // NOTE: callers enumerate the result and thus rely on Dictionary yielding entries
            // in insertion order, which holds as long as nothing is removed from it.
            foreach (KeyValuePair<K, V> pair in distinctDict.OrderByDescending(p => p.Value))
            {
                sortByValueDict.Add(pair.Key, pair.Value);
            }

            return sortByValueDict;
        }

        /// <summary>
        /// Tests whether a word is in the stop word list.
        /// </summary>
        private static bool IsStopword(string str)
        {
            return str != null && _StopWords.Contains(str);
        }

        /// <summary>
        /// Checks whether a string consists of digits only and converts it.
        /// </summary>
        /// <param name="message">Text to check.</param>
        /// <param name="result">The parsed number, or -1 when the method returns false.</param>
        /// <returns>
        /// True when the text is a digit string that fits into an <see cref="int"/>;
        /// false otherwise, including for null and for values too large to convert.
        /// </returns>
        private static bool isNumberic(string message, out int result)
        {
            result = -1;
            if (message == null || !_DigitsOnlyRegex.IsMatch(message))
            {
                return false;
            }

            // TryParse instead of Parse: long digit strings overflow and \d also accepts
            // non-ASCII digits, both of which made Parse throw.
            int parsed;
            if (!int.TryParse(message, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out parsed))
            {
                return false;
            }

            result = parsed;
            return true;
        }

        /// <summary>
        /// Tells whether a search text is too short to be worth segmenting:
        /// numeric text shorter than 5 bytes, other text shorter than 4 bytes.
        /// </summary>
        /// <param name="key">Text to check.</param>
        /// <returns>True when the text is below the length threshold.</returns>
        private static bool JudgementStr(string key)
        {
            // NOTE(review): the byte count comes from Encoding.Default, so the thresholds depend
            // on the platform encoding; "fewer than 4 bytes" presumably stands for fewer than two
            // double-byte characters - confirm before re-enabling the call in DoGetKeyword.
            int byteLength = Encoding.Default.GetByteCount(key);

            int ignored;
            if (isNumberic(key, out ignored))
            {
                return byteLength < 5;
            }

            return byteLength < 4;
        }
    }
}