编辑距离的算法是首先由俄国科学家Levenshtein提出的,故又叫 Levenshtein Distance。一个字符串可以通过增加一个字符,删除一个字符,替换一个字符得到另外一个字符串,假设,我们把从字符串A转换成字符串B,前面3种操 作所执行的最少次数称为AB相似度
如 abc adc 度为 1
ababababa babababab 度为 2
abcd acdb 度为2
如 abc adc 度为 1
ababababa babababab 度为 2
abcd acdb 度为2
using System; using System.Text.RegularExpressions; using System.Threading.Tasks; namespace Levenshtein { /// <summary> /// 分析完成事件委托 /// </summary> /// <param name="sim">相似度</param> public delegate void AnalyzerCompletedHander(double sim); /// <summary> /// 文章相似度工具 /// </summary> public class LevenshteinDistance:IDisposable { private string str1; private string str2; private int[,] index; int k; Task<double> task; /// <summary> /// 分析完成事件 /// </summary> public event AnalyzerCompletedHander AnalyzerCompleted; /// <summary> /// 获取或设置文章1 /// </summary> public string Str1 { get { return str1; } set { str1 = Format(value); index = new int[str1.Length, str2.Length]; } } /// <summary> /// 获取或设置文章2 /// </summary> public string Str2 { get { return str2; } set { str2 = Format(value); index = new int[str1.Length, str2.Length]; } } /// <summary> /// 运算总次数 /// </summary> public int TotalTimes { get { return str1.Length * str2.Length; } } /// <summary> /// 是否完成 /// </summary> public bool IsCompleted { get { return task.IsCompleted; } } /// <summary> /// 实例化 /// </summary> /// <param name="str1">文章1</param> /// <param name="str2">文章2</param> public LevenshteinDistance(string str1, string str2) { this.str1 = Format(str1); this.str2 = Format(str2); index = new int[str1.Length, str2.Length]; } public LevenshteinDistance() { } /// <summary> /// 异步开始任务 /// </summary> public void Start() { task = new Task<double>(Analyzer); task.Start(); task.ContinueWith(o => Completed(o.Result)); } /// <summary> /// 同步开始任务 /// </summary> /// <returns>相似度</returns> public double StartAyns() { task = new Task<double>(Analyzer); task.Start(); task.Wait(); return task.Result; } private void Completed(double s) { if (AnalyzerCompleted != null) { AnalyzerCompleted(s); } } private double Analyzer() { if (str1.Length == 0 || str2.Length == 0) return 0; for (int i = 0; i < str1.Length; i++) { for (int j = 0; j < str2.Length; j++) { k = str1[i] == str2[j] ? 0 : 1; if (i == 0&&j==0) { continue; } else if (i == 0) { index[i, j] = k + index[i, j - 1]; continue; } else if (j == 0) { index[i, j] = k + index[i - 1, j]; continue; } int temp = Min(index[i, j - 1], index[i - 1, j], index[i - 1, j - 1]); index[i, j] = temp + k; } } float similarty = 1 - (float)index[str1.Length - 1, str2.Length - 1] / (str1.Length > str2.Length ? str1.Length : str2.Length); return similarty; } private string Format(string str) { str = Regex.Replace(str, @"[^a-zA-Z0-9\u4e00-\u9fa5\s]", ""); return str; } private int Min(int a, int b, int c) { int temp = a < b ? a : b; temp = temp < c ? temp : c; return temp; } public void Dispose() { task.Dispose(); } } }