#region Copyright And Revision History
/*---------------------------------------------------------------------------
MyersDiff.cs
Copyright © 2002 Bill Menees. All rights reserved.
Bill@Menees.com
Who When What
------- ---------- -----------------------------------------------------
BMenees 10.20.2002 Created.
-----------------------------------------------------------------------------*/
#endregion
using System;
using System.Diagnostics;
using System.Drawing;
using System.Collections.Generic;
namespace Menees.DiffUtils
{
///
/// This class implements the differencing algorithm from
/// "An O(ND) Difference Algorithm And Its Variations" by
/// Eugene W. Myers. It is the standard algorithm used by
/// the UNIX diff utilities.
///
/// This implementation diffs two comparable arrays. It is
/// typically used with arrays of hash values where each
/// hash corresponds to a line of text. Then it can be
/// used to diff two text files on a line-by-line basis.
///
public class MyersDiff where T : IComparable
{
#region Constructors
public MyersDiff(T[] arA, T[] arB, bool bSupportChangeEditType)
{
m_arA = arA;
m_arB = arB;
m_bSupportChangeEditType = bSupportChangeEditType;
int N = arA.Length;
int M = arB.Length;
m_vecForward = new DiagonalVector(N, M);
m_vecReverse = new DiagonalVector(N, M);
}
#endregion
#region Public Methods
///
/// Returns an EditScript instance that gives all the Edits
/// necessary to transform A into B.
///
public EditScript Execute()
{
List MatchPoints = new List();
SubArray A = new SubArray(m_arA);
SubArray B = new SubArray(m_arB);
GetMatchPoints(A, B, MatchPoints);
Debug.Assert(MatchPoints.Count == GetLCSLength(), "The number of match points must equal the LCS length.");
EditScript Script = ConvertMatchPointsToEditScript(A.Length, B.Length, MatchPoints);
Debug.Assert(Script.TotalEditLength == GetSESLength());
return Script;
}
///
/// Returns the longest common subsequence from A and B.
///
public IList GetLCS()
{
List Output = new List();
GetLCS(new SubArray(m_arA), new SubArray(m_arB), Output);
return Output;
}
///
/// Calculates the length that the LCS should be without
/// actually determining the LCS.
///
public int GetLCSLength()
{
//Per Myers's paper, we should always have
//D+2L == N+M. So L == (N+M-D)/2.
return (m_arA.Length + m_arB.Length - GetSESLength()) / 2;
}
///
/// Returns a similary index between 0 and 1 inclusive.
/// 0 means A and B are completely different. 1 means
/// they are exactly alike. The similarity index is
/// calculated as twice the length of the LCS divided
/// by the sum of A and B's lengths.
///
public double GetSimilarity()
{
return (2.0*GetLCSLength())/(double)(m_arA.Length + m_arB.Length);
}
///
/// Calculates the length of the "shortest edit script"
/// as defined in Myers's paper.
///
/// Note: This may not be the same as the Count property
/// of an EditScript instance returned by Execute(). If
/// an EditScript instance has any Edits with Length > 1,
/// then those groupings will make EditScript.Count less
/// than GetSESLength(). Similarly, an Edit with EditType
/// Change should be thought of as a combined Delete and
/// Insert for the specified Length.
///
public int GetSESLength()
{
SubArray A = new SubArray(m_arA);
SubArray B = new SubArray(m_arB);
if (SetupFictitiousPoints(A, B))
{
int N = m_arA.Length;
int M = m_arB.Length;
for (int D = 0; D <= N+M; D++)
{
for (int k = -D; k <= D; k+=2)
{
int x = GetForwardDPaths(A, B, D, k);
int y = x - k;
if (x >= N && y >= M)
{
return D;
}
}
}
//We should never get here if the algorithm is coded correctly.
Debug.Assert(false);
return -1;
}
else if (m_arA.Length == 0)
{
return m_arB.Length;
}
else
{
return m_arA.Length;
}
}
///
/// Gets the length of the "shortest edit script"
/// by running the algorithm in reverse. We should
/// always have GetSESLength() == GetReverseSESLength().
///
public int GetReverseSESLength()
{
SubArray A = new SubArray(m_arA);
SubArray B = new SubArray(m_arB);
if (SetupFictitiousPoints(A, B))
{
int N = m_arA.Length;
int M = m_arB.Length;
int iDelta = N-M;
for (int D = 0; D <= N+M; D++)
{
for (int k = -D; k <= D; k+=2)
{
int x = GetReverseDPaths(A, B, D, k, iDelta);
int y = x - (k+iDelta);
if (x <= 0 && y <= 0)
{
return D;
}
}
}
//We should never get here if the algorithm is coded correctly.
Debug.Assert(false);
return -1;
}
else if (m_arA.Length == 0)
{
return m_arB.Length;
}
else
{
return m_arA.Length;
}
}
#endregion
#region Private Methods
private bool SetupFictitiousPoints(SubArray A, SubArray B)
{
if (A.Length > 0 && B.Length > 0)
{
//Setup some "fictious" endpoints for initial forward
//and reverse path navigation.
m_vecForward[1] = 0;
int N = A.Length;
int M = B.Length;
int iDelta = N-M;
m_vecReverse[iDelta + 1] = N + 1;
return true;
}
else
return false;
}
private int GetForwardDPaths(SubArray A, SubArray B, int D, int k)
{
DiagonalVector V = m_vecForward;
int x;
if ((k == -D) || (k != D && V[k-1] < V[k+1]))
{
x = V[k+1];
}
else
{
x = V[k-1]+1;
}
int y = x - k;
while (x < A.Length && y < B.Length && A[x+1].CompareTo(B[y+1]) == 0)
{
x++;
y++;
}
V[k] = x;
return x;
}
private int GetReverseDPaths(SubArray A, SubArray B, int D, int k, int iDelta)
{
DiagonalVector V = m_vecReverse;
int p = k + iDelta;
int x;
if ((k == -D) || (k != D && V[p+1] <= V[p-1]))
{
x = V[p+1]-1;
}
else
{
x = V[p-1];
}
int y = x - p;
while (x > 0 && y > 0 && A[x].CompareTo(B[y]) == 0)
{
x--;
y--;
}
V[p] = x;
return x;
}
private int FindMiddleSnake(SubArray A, SubArray B, out int iPathStartX, out int iPathEndX, out int iPathK)
{
//We don't have to check the result of this because the calling procedure
//has already check the length preconditions.
SetupFictitiousPoints(A, B);
iPathStartX = -1;
iPathEndX = -1;
iPathK = 0;
int iDelta = A.Length - B.Length;
int iCeiling = (int)Math.Ceiling((A.Length + B.Length) / 2.0);
for (int D = 0; D <= iCeiling; D++)
{
for (int k = -D; k <= D; k+=2)
{
//Find the end of the furthest reaching forward D-path in diagonal k.
GetForwardDPaths(A, B, D, k);
//If iDelta is odd (i.e. remainder == 1 or -1) and ...
if ((iDelta % 2 != 0) && (k >= (iDelta-(D-1)) && k <= (iDelta+(D-1))))
{
//If the path overlaps the furthest reaching reverse (D-1)-path in diagonal k.
if (m_vecForward[k] >= m_vecReverse[k])
{
//The last snake of the forward path is the middle snake.
iPathK = k;
iPathEndX = m_vecForward[k];
iPathStartX = iPathEndX;
int iPathStartY = iPathStartX - iPathK;
while (iPathStartX > 0 && iPathStartY > 0 && A[iPathStartX].CompareTo(B[iPathStartY]) == 0)
{
iPathStartX--;
iPathStartY--;
}
//Length of an SES is 2D-1.
return 2*D-1;
}
}
}
for (int k = -D; k <= D; k+=2)
{
//Find the end of the furthest reaching reverse D=path in diagonal k+iDelta
GetReverseDPaths(A, B, D, k, iDelta);
//If iDelta is even and ...
if ((iDelta % 2 == 0) && ((k+iDelta) >= -D && (k+iDelta) <= D))
{
//If the path overlaps the furthest reaching forward D-path in diagonal k+iDelta.
if (m_vecReverse[k+iDelta] <= m_vecForward[k+iDelta])
{
//The last snake of the reverse path is the middle snake.
iPathK = k+iDelta;
iPathStartX = m_vecReverse[iPathK];
iPathEndX = iPathStartX;
int iPathEndY = iPathEndX - iPathK;
while(iPathEndX < A.Length && iPathEndY < B.Length && A[iPathEndX+1].CompareTo(B[iPathEndY+1]) == 0)
{
iPathEndX++;
iPathEndY++;
}
//Length of an SES is 2D.
return 2*D;
}
}
}
}
//We should never get here if the algorithm is coded correctly.
Debug.Assert(false);
return -1;
}
private void GetLCS(SubArray A, SubArray B, List Output)
{
if (A.Length > 0 && B.Length > 0)
{
//Find the length D and the middle snake from (x,y) to (u,v)
int x, u, k;
int D = FindMiddleSnake(A, B, out x, out u, out k);
int y = x - k;
int v = u - k;
if (D > 1)
{
GetLCS(new SubArray(A, 1, x), new SubArray(B, 1, y), Output);
for (int i = x+1; i <= u; i++)
{
Output.Add(A[i]);
}
GetLCS(new SubArray(A, u + 1, A.Length - u), new SubArray(B, v + 1, B.Length - v), Output);
}
else if (B.Length > A.Length)
{
for (int i = 1; i <= A.Length; i++)
{
Output.Add(A[i]);
}
}
else
{
for (int i = 1; i <= B.Length; i++)
{
Output.Add(B[i]);
}
}
}
}
private void GetMatchPoints(SubArray A, SubArray B, List MatchPoints)
{
if (A.Length > 0 && B.Length > 0)
{
//Find the middle snake from (x,y) to (u,v)
int x, u, k;
int D = FindMiddleSnake(A, B, out x, out u, out k);
int y = x - k;
int v = u - k;
if (D > 1)
{
GetMatchPoints(new SubArray(A, 1, x), new SubArray(B, 1, y), MatchPoints);
for (int i = x+1; i <= u; i++)
{
//Output absolute X and Y (not relative to the current subarray)
MatchPoints.Add(new Point(i + A.Offset, i-k + B.Offset));
}
GetMatchPoints(new SubArray(A, u + 1, A.Length - u), new SubArray(B, v + 1, B.Length - v), MatchPoints);
}
else
{
//If there are no differences, we have to output all of the points.
//If there's only one difference, we have to output all of the
//match points, skipping the single point that is different.
Debug.Assert(D == 0 || Math.Abs(A.Length-B.Length) == 1, "A and B's lengths must differ by 1 if D == 1");
//Only go to the minimum of the two lengths since that's the
//most that can possibly match between the two subsequences.
int N = A.Length;
int M = B.Length;
if (M > N)
{
//Output A[1..N] as match points
int iCurrY = 1;
for (int i = 1; i <= N; i++)
{
//We must skip the one difference when we hit it
if (A[i].CompareTo(B[iCurrY]) != 0)
{
iCurrY++;
}
MatchPoints.Add(new Point(i + A.Offset, iCurrY + B.Offset));
iCurrY++;
}
}
else
{
//Output B[1..M] as match points
int iCurrX = 1;
for (int i = 1; i <= M; i++)
{
//We must skip the one difference when we hit it
if (A[iCurrX].CompareTo(B[i]) != 0)
{
iCurrX++;
}
MatchPoints.Add(new Point(iCurrX + A.Offset, i + B.Offset));
iCurrX++;
}
}
}
}
}
private EditScript ConvertMatchPointsToEditScript(int N, int M, List MatchPoints)
{
EditScript Script = new EditScript();
int iCurrX = 1;
int iCurrY = 1;
//Add a fictitious match point at (N+1, M+1) so we're guaranteed to
//pick up all edits with a single loop.
MatchPoints.Add(new Point(N+1, M+1));
//NOTE: When we create new Edit instances, we'll store iCurrX and iCurrY
//minus 1 because we want to convert them back to 0-based indexes for
//the user. The user shouldn't have to know that internally we use any
//1-based types.
foreach (Point Pt in MatchPoints)
{
int iMatchX = Pt.X;
int iMatchY = Pt.Y;
//A one-to-one grouping of inserts and deletes will be considered a change.
if (m_bSupportChangeEditType && iCurrX < iMatchX && iCurrY < iMatchY)
{
int iChangeLength = Math.Min(iMatchX-iCurrX, iMatchY-iCurrY);
Script.Add(new Edit(EditType.Change, iCurrX - 1, iCurrY - 1, iChangeLength));
iCurrX += iChangeLength;
iCurrY += iChangeLength;
}
if (iCurrX < iMatchX)
{
Script.Add(new Edit(EditType.Delete, iCurrX-1, iCurrY-1, iMatchX - iCurrX));
}
if (iCurrY < iMatchY)
{
Script.Add(new Edit(EditType.Insert, iCurrX-1, iCurrY-1, iMatchY - iCurrY));
}
iCurrX = iMatchX + 1;
iCurrY = iMatchY + 1;
}
return Script;
}
#endregion
#region Private Member Variables
private T[] m_arA; //Sequence A
private T[] m_arB; //Sequence B
private DiagonalVector m_vecForward;
private DiagonalVector m_vecReverse;
private bool m_bSupportChangeEditType;
#endregion
}
}