抢分的进来啊小弟我想做一个搜索时进行分词的

2012-01-18

抢分的进来啊，我想做一个搜索时进行分词的。首先不论输入英文，中文，数字结合都可以搜到。如:aspx2.0分词第一

抢分的进来啊，我想做一个搜索时进行分词的。
首先不论输入英文，中文，数字结合都可以搜到。
如:aspx2.0分词

第一次得出与 "aspx2.0分词" 相同的内容。
第二次得出 "aspx" 相关的内容。
第三次得出 "aspx2.0" 相关的内容。
第四次得出 "分词" 相关的内容。
第五次得出 "2.0" 相关的内容。
第六次得出 "分词2.0" 相关的内容。

哪位高手做一下，越接近我的要求越好。谢谢。

[解决办法]
贴出来，大家提提意见
/// <summary>
/// Dictionary-based word segmenter for mixed Chinese / English / numeric text.
/// Input is first cut on a fixed set of delimiters; each remaining segment is
/// then tokenized: runs of ASCII letters and digits become one token, and all
/// other text is segmented by forward maximum matching against the loaded
/// dictionary (longest candidate first, single character as the fallback).
/// </summary>
public class SplitWord
{
    /// <summary>Longest dictionary word, in characters, tried during matching.</summary>
    private const int MaxWordLength = 7;

    // Lower-cased dictionary words. A hash set gives constant-time lookups;
    // the matcher performs up to MaxWordLength lookups per token.
    private readonly HashSet<string> dictionary = new HashSet<string>();

    // Separators removed from the input before tokenizing.
    // NOTE(review): the second entry is indistinguishable from an ASCII space in
    // the extracted source; it is presumed to be the full-width space U+3000 -
    // confirm against the original file.
    // NOTE(review): "." is a delimiter, so a token such as "2.0" is cut into
    // "2" and "0" before the English/number scanner (which would accept '.')
    // ever sees it - confirm this is intended.
    private readonly string[] delimiters = new string[]
    {
        " ", "\u3000", ".", "《", "》", "+", "%", "?", "[", "]", "{", "}", "(", ")",
        "（", "）", "\r\n", ",", "，", "。", "'", "‘", "’", "“", "”", "\"", "\\", "*", "？", "|", ";", ":"
    };

    // Accumulated load diagnostics; each entry is preceded by a line break.
    private readonly StringBuilder debugInfo = new StringBuilder();

    /// <summary>Diagnostic messages recorded while loading dictionaries.</summary>
    public string DebugInfo
    {
        get { return debugInfo.ToString(); }
    }

    /// <summary>
    /// Creates a segmenter and loads the two default dictionary files from the
    /// application base directory.
    /// </summary>
    public SplitWord()
    {
        LoadDefaultDictionary();
    }

    /// <summary>
    /// Creates a segmenter and loads the given dictionary files. A file that
    /// cannot be read is reported through <see cref="DebugInfo"/> and skipped.
    /// </summary>
    /// <param name="dictionaryfiles">Paths of the dictionary files, one word per line.</param>
    /// <exception cref="ArgumentNullException"><paramref name="dictionaryfiles"/> is null.</exception>
    public SplitWord(string[] dictionaryfiles)
    {
        if (dictionaryfiles == null)
        {
            throw new ArgumentNullException("dictionaryfiles");
        }

        foreach (string file in dictionaryfiles)
        {
            // loadOneDict records its own failures, so no try/catch is needed here.
            loadOneDict(file, false);
        }
    }

    /// <summary>The delimiter strings used to pre-split the input.</summary>
    public string[] Delimitor
    {
        get { return delimiters; }
    }

    /// <summary>Loads the built-in dictionaries located next to the application.</summary>
    private void LoadDefaultDictionary()
    {
        // Path.Combine copes with a base directory with or without a trailing separator.
        string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
        loadOneDict(Path.Combine(baseDirectory, "StandardWord213663.TXT"), false);
        loadOneDict(Path.Combine(baseDirectory, "VisaFormWord.TXT"), true);
    }

    /// <summary>
    /// Loads one dictionary file (one word per line, read with the system
    /// default encoding) into the word set. Words are trimmed and lower-cased so
    /// that they agree with the lower-cased lookups done while matching; blank
    /// lines are ignored. The outcome is appended to <see cref="DebugInfo"/>;
    /// this method does not throw on I/O failure.
    /// </summary>
    /// <param name="filename">Path of the dictionary file.</param>
    /// <param name="check">
    /// When true, every word that is already present is reported in
    /// <see cref="DebugInfo"/>. Duplicates are never stored twice either way.
    /// </param>
    public void loadOneDict(string filename, bool check)
    {
        try
        {
            int added = 0;

            // 'using' closes the reader even when reading fails half-way.
            using (StreamReader reader = new StreamReader(filename, Encoding.Default))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    string word = line.Trim().ToLowerInvariant();
                    if (word.Length == 0)
                    {
                        continue;
                    }

                    if (dictionary.Add(word))
                    {
                        added++;
                    }
                    else if (check)
                    {
                        writeLine("[ " + line + "] 已经在字典中 ");
                    }
                }
            }

            writeLine("词库装载成功 " + added.ToString());
        }
        catch (Exception ex)
        {
            // Best effort by design: a missing or unreadable dictionary must not
            // prevent the segmenter from being constructed.
            writeLine("词库装载失败 " + ex.Message);
        }
    }

    /// <summary>Appends one message to the diagnostic log.</summary>
    private void writeLine(string s)
    {
        debugInfo.Append("\r\n").Append(s);
    }

    /// <summary>
    /// Segments <paramref name="inputString"/> and returns the tokens joined by
    /// single spaces.
    /// </summary>
    /// <param name="inputString">Text to segment; null or empty yields an empty string.</param>
    /// <returns>The space-separated tokens, without leading or trailing whitespace.</returns>
    public string Split(string inputString)
    {
        if (string.IsNullOrEmpty(inputString))
        {
            return string.Empty;
        }

        // No count limit: every delimiter is honoured regardless of input size.
        string[] segments = inputString.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

        List<string> tokens = new List<string>();
        foreach (string segment in segments)
        {
            Tokenize(segment, tokens);
        }

        return string.Join(" ", tokens.ToArray()).Trim();
    }

    /// <summary>
    /// Cuts one delimiter-free segment into tokens and appends them to
    /// <paramref name="tokens"/>. Iterative, so long segments cannot exhaust
    /// the call stack.
    /// </summary>
    private void Tokenize(string segment, List<string> tokens)
    {
        int pos = 0;
        while (pos < segment.Length)
        {
            int length = IsAsciiLetterOrDigit(segment[pos])
                ? MeasureEnglishNumber(segment, pos)
                : MeasureDictionaryWord(segment, pos);

            tokens.Add(segment.Substring(pos, length));
            pos += length;
        }
    }

    /// <summary>
    /// Length of the English/number token starting at <paramref name="start"/>:
    /// the first character plus every following ASCII letter, digit, '-', '.',
    /// '@' or '_'.
    /// </summary>
    private static int MeasureEnglishNumber(string text, int start)
    {
        int end = start + 1;
        while (end < text.Length && IsEnglishWordChar(text[end]))
        {
            end++;
        }

        return end - start;
    }

    /// <summary>
    /// Length of the longest dictionary word starting at <paramref name="start"/>
    /// (at most <see cref="MaxWordLength"/> characters), or 1 when no
    /// multi-character word matches.
    /// </summary>
    private int MeasureDictionaryWord(string text, int start)
    {
        int longest = Math.Min(MaxWordLength, text.Length - start);
        for (int length = longest; length > 1; length--)
        {
            string candidate = text.Substring(start, length).ToLowerInvariant();
            if (dictionary.Contains(candidate))
            {
                return length;
            }
        }

        // A single character is emitted whether or not it is in the dictionary.
        return 1;
    }

    /// <summary>True for a-z, A-Z and 0-9 (culture-independent).</summary>
    private static bool IsAsciiLetterOrDigit(char c)
    {
        char lower = char.ToLowerInvariant(c);
        return (lower >= 'a' && lower <= 'z') || (lower >= '0' && lower <= '9');
    }

    /// <summary>True for characters allowed after the first one in an English/number token.</summary>
    private static bool IsEnglishWordChar(char c)
    {
        return IsAsciiLetterOrDigit(c) || c == '-' || c == '.' || c == '@' || c == '_';
    }
}

热点排行

asp.net

抢分的进来啊小弟我想做一个搜索时进行分词的