Levenshtein Distance算法(編輯距離算法),levenshteindistance
using System;
using System.Collections.Generic;
/*
* 作者:熊仔其人
* 時間:2014年4月22日
*/
namespace DataTool
{
/// <summary>
/// 相似度
/// 熊仔其人
/// 2014年4月22日
/// </summary>
public static class LevenshteinDistance
{
#region Levenshtein Distance算法(編輯距離算法)
/// <summary>
/// 三個數字中取最小的一個數字
/// </summary>
/// <param name="first"></param>
/// <param name="second"></param>
/// <param name="third"></param>
/// <returns></returns>
private static int LowerOfThree(int first, int second, int third)
{
int min = first;
if (second < min)
min = second;
if (third < min)
min = third;
return min;
}
/// <summary>
/// 根據Levenshtein Distance算法(編輯距離算法)計算兩個字符串的相似度
/// </summary>
/// <param name="text1"></param>
/// <param name="text2"></param>
/// <returns></returns>
private static int Levenshtein_Distance(string text1, string text2)
{
int[,] Matrix;
int n = text1.Length;
int m = text2.Length;
int temp = 0;
char ch1;
char ch2;
int i = 0;
int j = 0;
if (n == 0)
{
return m;
}
if (m == 0)
{
return n;
}
Matrix = new int[n + 1, m + 1];
for (i = 0; i <= n; i++)
{
//初始化第一列
Matrix[i, 0] = i;
}
for (j = 0; j <= m; j++)
{
//初始化第一行
Matrix[0, j] = j;
}
for (i = 1; i <= n; i++)
{
ch1 = text1[i - 1];
for (j = 1; j <= m; j++)
{
ch2 = text2[j - 1];
if (ch1.Equals(ch2))
{
temp = 0;
}
else
{
temp = 1;
}
Matrix[i, j] = LowerOfThree(Matrix[i - 1, j] + 1, Matrix[i, j - 1] + 1, Matrix[i - 1, j - 1] + temp);
}
}
//for (i = 0; i <= n; i++)
//{
// for (j = 0; j <= m; j++)
// {
// Console.Write(" {0} ", Matrix[i, j]);
// }
// Console.WriteLine("");
//}
return Matrix[n, m];
}
/// <summary>
/// 根據Levenshtein Distance算法(編輯距離算法)計算兩個字符串的相似度(百分比)
/// </summary>
/// <param name="text1">第一個字符串</param>
/// <param name="text2">第二個字符串</param>
/// <returns>相似度(百分比)</returns>
public static decimal LevenshteinDistancePercent(string text1, string text2)
{
if (string.IsNullOrEmpty(text1) && string.IsNullOrEmpty(text2))
return 1;
else if (string.IsNullOrEmpty(text1) || string.IsNullOrEmpty(text2))
return 0;
int maxLenth = text1.Length > text2.Length ? text1.Length : text2.Length;
int val = Levenshtein_Distance(text1, text2);
return 1 - (decimal)val / maxLenth;
}
#endregion
#region 計算兩個字符串的相似度(百分比)
/// <summary>
/// 計算兩個字符串的相似度(百分比),比較每一個字符組成,返回結果相似度與字符順序有關,但是並不需要順序完全一致
/// </summary>
/// <param name="text1">第一個字符串</param>
/// <param name="text2">第二個字符串</param>
/// <returns>相似度(百分比)</returns>
public static decimal SimilarByStringPercent(string text1, string text2)
{
if (string.IsNullOrEmpty(text1) && string.IsNullOrEmpty(text2))
return 1;
else if (string.IsNullOrEmpty(text1) || string.IsNullOrEmpty(text2))
return 0;
decimal returnValue = 0;
int maxLength;
int i, l;
List<string> tb1 = new List<string>();
List<string> tb2 = new List<string>();
i = 0;
l = 1;
maxLength = text1.Length;
if (text1.Length < text2.Length)
maxLength = text2.Length;
while (l <= text1.Length)
{
while (i < text1.Length - 1)
{
if (i + l > text1.Length)
break;
tb1.Add(text1.Substring(i, l));
i++;
}
i = 0;
l++;
}
i = 0;
l = 1;
while (l <= text2.Length)
{
while (i < text2.Length - 1)
{
if (i + l > text2.Length)
break;
tb2.Add(text2.Substring(i, l));
i++;
}
i = 0;
l++;
}
foreach (string subStr in tb1)
{
decimal tempRe = 0;
if (tb2.Contains(subStr))
{
tempRe = (decimal)subStr.Length / maxLength;
if (tempRe > returnValue)
returnValue = tempRe;
if (tempRe == 1)
break;
}
}
return returnValue;
}
#endregion
}
}