抢分的进来啊,我想做一个在搜索时进行分词的功能。
首先不论输入英文,中文,数字结合都可以搜到。
如:aspx2.0分词
第一次得出与 "aspx2.0分词" 相同的内容。
第二次得出 "aspx" 相关的内容。
第三次得出 "aspx2.0" 相关的内容。
第四次得出 "分词" 相关的内容。
第五次得出 "2.0" 相关的内容。
第六次得出 "分词2.0" 相关的内容。
哪位高手做一下,越接近我的要求越好。谢谢。
[解决办法]
贴出来,大家提提意见
/// <summary>
/// Dictionary-based word segmenter for text that mixes Chinese, English and digits.
/// The input is first cut at the configured delimiters; each remaining piece is then
/// scanned left to right, emitting either an ASCII letter/digit run or the longest
/// dictionary word (forward maximum matching) found at the current position.
/// </summary>
public class SplitWord
{
    /// <summary>Longest candidate, in UTF-16 code units, probed against the dictionary.</summary>
    private const int MaxWordLength = 7;

    // Entries are stored lower-cased (invariant) because lookups lower-case the candidate.
    // A hash set keeps each probe O(1); the word lists loaded here can hold hundreds of
    // thousands of entries and every input position triggers up to MaxWordLength probes.
    private readonly HashSet<string> dictionary = new HashSet<string>(StringComparer.Ordinal);

    // NOTE(review): the posted source listed the space, "(", ")", "," and "?" twice each;
    // the second occurrences are assumed to have been the full-width forms before the text
    // was normalised by the forum software - confirm against the author's original file.
    // NOTE(review): "." is a delimiter, so Split never hands a "." to the letter/digit
    // scanner even though that scanner accepts it inside a run.
    private readonly string[] delimiters = new string[]
    {
        " ", "\u3000", ".", "《", "》", "+", "%", "?", "[", "]", "{", "}", "(", ")",
        "（", "）", "\r\n", ",", "，", "。", "'", "‘", "’", "“", "”", "\"", "\\", "*", "？", "|", ";", ":"
    };

    // Accumulates the diagnostic log returned by DebugInfo.
    private readonly StringBuilder debugInfo = new StringBuilder(" ");

    /// <summary>Gets the diagnostic messages recorded so far (dictionary load results).</summary>
    public string DebugInfo
    {
        get { return debugInfo.ToString(); }
    }

    /// <summary>
    /// Creates a segmenter and loads the two default word lists from the application
    /// base directory. Load failures are recorded in <see cref="DebugInfo"/>, not thrown.
    /// </summary>
    public SplitWord()
    {
        LoadDefaultDictionary();
    }

    /// <summary>
    /// Creates a segmenter and loads every listed word-list file. A file that cannot be
    /// read is recorded in <see cref="DebugInfo"/> and skipped.
    /// </summary>
    /// <param name="dictionaryfiles">Paths of the word-list files, one word per line.</param>
    /// <exception cref="ArgumentNullException"><paramref name="dictionaryfiles"/> is null.</exception>
    public SplitWord(string[] dictionaryfiles)
    {
        if (dictionaryfiles == null)
        {
            throw new ArgumentNullException("dictionaryfiles");
        }

        for (int i = 0; i < dictionaryfiles.Length; i++)
        {
            // loadOneDict reports its own failures, so one bad file does not stop the rest.
            loadOneDict(dictionaryfiles[i], false);
        }
    }

    /// <summary>Gets the delimiter strings at which <see cref="Split"/> cuts its input.</summary>
    public string[] Delimitor
    {
        get { return delimiters; }
    }

    // Loads the two bundled word lists that are expected next to the application.
    private void LoadDefaultDictionary()
    {
        // Path.Combine avoids a hard-coded "\" and a doubled separator when the base
        // directory already ends with one.
        string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
        loadOneDict(Path.Combine(baseDirectory, "StandardWord213663.TXT"), false);
        loadOneDict(Path.Combine(baseDirectory, "VisaFormWord.TXT"), true);
    }

    /// <summary>
    /// Adds the words of one file (one word per line) to the dictionary. Words are trimmed
    /// and lower-cased; blank lines and words already present are skipped. The outcome,
    /// including the number of words actually added, is appended to <see cref="DebugInfo"/>.
    /// This method does not throw for unreadable files.
    /// </summary>
    /// <param name="filename">Path of the word-list file.</param>
    /// <param name="check">
    /// Retained for backward compatibility. It used to request duplicate checking; the
    /// dictionary is now a set, so duplicates are always ignored.
    /// </param>
    public void loadOneDict(string filename, bool check)
    {
        try
        {
            int added = 0;

            // NOTE(review): Encoding.Default is platform dependent - confirm it matches
            // the encoding the word-list files are actually saved in.
            using (StreamReader reader = new StreamReader(filename, Encoding.Default))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    string word = line.Trim().ToLowerInvariant();
                    if (word.Length == 0)
                    {
                        continue;
                    }

                    if (dictionary.Add(word))
                    {
                        added++;
                    }
                }
            }

            writeLine("词库装载成功 " + added.ToString());
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: a missing or unreadable word list must not
            // prevent the segmenter from being constructed or used.
            writeLine("词库装载失败 " + ex.Message);
        }
    }

    // Appends one message to the diagnostic log.
    private void writeLine(string s)
    {
        debugInfo.Append("\r\n ").Append(s);
    }

    /// <summary>
    /// Segments <paramref name="inputString"/> into words.
    /// </summary>
    /// <param name="inputString">Text to segment; null or empty yields an empty string.</param>
    /// <returns>The words in input order, separated by single spaces.</returns>
    public string Split(string inputString)
    {
        if (string.IsNullOrEmpty(inputString))
        {
            return string.Empty;
        }

        string[] pieces = inputString.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);
        List<string> tokens = new List<string>();
        for (int i = 0; i < pieces.Length; i++)
        {
            Tokenize(pieces[i], tokens);
        }

        // Trim keeps the historical behaviour of dropping leading/trailing whitespace
        // tokens (for example a tab, which is not a delimiter).
        return string.Join(" ", tokens.ToArray()).Trim();
    }

    // Scans one delimiter-free piece left to right and appends its tokens.
    // Iterative on purpose: stack use stays constant however long the piece is.
    private void Tokenize(string text, List<string> tokens)
    {
        int pos = 0;
        while (pos < text.Length)
        {
            int length;
            if (IsAsciiLetterOrDigit(text[pos]))
            {
                length = MeasureLetterDigitRun(text, pos);
            }
            else
            {
                length = MeasureDictionaryWord(text, pos);
            }

            tokens.Add(text.Substring(pos, length));
            pos += length;
        }
    }

    // Length of the run starting at 'start': one ASCII letter/digit followed by any
    // number of ASCII letters, digits, '-', '.', '@' or '_'.
    private static int MeasureLetterDigitRun(string text, int start)
    {
        int end = start + 1;
        while (end < text.Length && IsRunContinuation(text[end]))
        {
            end++;
        }

        return end - start;
    }

    // Length of the longest dictionary word starting at 'start' (at most MaxWordLength
    // code units). Falls back to a single character when nothing matches.
    private int MeasureDictionaryWord(string text, int start)
    {
        int longest = Math.Min(MaxWordLength, text.Length - start);
        for (int length = longest; length > 1; length--)
        {
            string candidate = text.Substring(start, length).ToLowerInvariant();
            if (dictionary.Contains(candidate))
            {
                return length;
            }
        }

        // Keep a surrogate pair together so the fallback never emits half a character.
        return char.IsSurrogatePair(text, start) ? 2 : 1;
    }

    // Explicit ranges instead of ToLower(): the result must not depend on the current
    // culture (e.g. Turkish casing of 'I').
    private static bool IsAsciiLetterOrDigit(char c)
    {
        return (c >= 'a' && c <= 'z')
            || (c >= 'A' && c <= 'Z')
            || (c >= '0' && c <= '9');
    }

    // Characters allowed after the first character of a letter/digit run.
    private static bool IsRunContinuation(char c)
    {
        return IsAsciiLetterOrDigit(c) || c == '-' || c == '.' || c == '@' || c == '_';
    }
}