#region Copyright And Revision History /*--------------------------------------------------------------------------- MyersDiff.cs Copyright © 2002 Bill Menees. All rights reserved. Bill@Menees.com Who When What ------- ---------- ----------------------------------------------------- BMenees 10.20.2002 Created. -----------------------------------------------------------------------------*/ #endregion using System; using System.Diagnostics; using System.Drawing; using System.Collections.Generic; namespace Menees.DiffUtils { /// /// This class implements the differencing algorithm from /// "An O(ND) Difference Algorithm And Its Variations" by /// Eugene W. Myers. It is the standard algorithm used by /// the UNIX diff utilities. /// /// This implementation diffs two comparable arrays. It is /// typically used with arrays of hash values where each /// hash corresponds to a line of text. Then it can be /// used to diff two text files on a line-by-line basis. /// public class MyersDiff where T : IComparable { #region Constructors public MyersDiff(T[] arA, T[] arB, bool bSupportChangeEditType) { m_arA = arA; m_arB = arB; m_bSupportChangeEditType = bSupportChangeEditType; int N = arA.Length; int M = arB.Length; m_vecForward = new DiagonalVector(N, M); m_vecReverse = new DiagonalVector(N, M); } #endregion #region Public Methods /// /// Returns an EditScript instance that gives all the Edits /// necessary to transform A into B. /// public EditScript Execute() { List MatchPoints = new List(); SubArray A = new SubArray(m_arA); SubArray B = new SubArray(m_arB); GetMatchPoints(A, B, MatchPoints); Debug.Assert(MatchPoints.Count == GetLCSLength(), "The number of match points must equal the LCS length."); EditScript Script = ConvertMatchPointsToEditScript(A.Length, B.Length, MatchPoints); Debug.Assert(Script.TotalEditLength == GetSESLength()); return Script; } /// /// Returns the longest common subsequence from A and B. /// public IList GetLCS() { List Output = new List(); GetLCS(new SubArray(m_arA), new SubArray(m_arB), Output); return Output; } /// /// Calculates the length that the LCS should be without /// actually determining the LCS. /// public int GetLCSLength() { //Per Myers's paper, we should always have //D+2L == N+M. So L == (N+M-D)/2. return (m_arA.Length + m_arB.Length - GetSESLength()) / 2; } /// /// Returns a similary index between 0 and 1 inclusive. /// 0 means A and B are completely different. 1 means /// they are exactly alike. The similarity index is /// calculated as twice the length of the LCS divided /// by the sum of A and B's lengths. /// public double GetSimilarity() { return (2.0*GetLCSLength())/(double)(m_arA.Length + m_arB.Length); } /// /// Calculates the length of the "shortest edit script" /// as defined in Myers's paper. /// /// Note: This may not be the same as the Count property /// of an EditScript instance returned by Execute(). If /// an EditScript instance has any Edits with Length > 1, /// then those groupings will make EditScript.Count less /// than GetSESLength(). Similarly, an Edit with EditType /// Change should be thought of as a combined Delete and /// Insert for the specified Length. /// public int GetSESLength() { SubArray A = new SubArray(m_arA); SubArray B = new SubArray(m_arB); if (SetupFictitiousPoints(A, B)) { int N = m_arA.Length; int M = m_arB.Length; for (int D = 0; D <= N+M; D++) { for (int k = -D; k <= D; k+=2) { int x = GetForwardDPaths(A, B, D, k); int y = x - k; if (x >= N && y >= M) { return D; } } } //We should never get here if the algorithm is coded correctly. Debug.Assert(false); return -1; } else if (m_arA.Length == 0) { return m_arB.Length; } else { return m_arA.Length; } } /// /// Gets the length of the "shortest edit script" /// by running the algorithm in reverse. We should /// always have GetSESLength() == GetReverseSESLength(). /// public int GetReverseSESLength() { SubArray A = new SubArray(m_arA); SubArray B = new SubArray(m_arB); if (SetupFictitiousPoints(A, B)) { int N = m_arA.Length; int M = m_arB.Length; int iDelta = N-M; for (int D = 0; D <= N+M; D++) { for (int k = -D; k <= D; k+=2) { int x = GetReverseDPaths(A, B, D, k, iDelta); int y = x - (k+iDelta); if (x <= 0 && y <= 0) { return D; } } } //We should never get here if the algorithm is coded correctly. Debug.Assert(false); return -1; } else if (m_arA.Length == 0) { return m_arB.Length; } else { return m_arA.Length; } } #endregion #region Private Methods private bool SetupFictitiousPoints(SubArray A, SubArray B) { if (A.Length > 0 && B.Length > 0) { //Setup some "fictious" endpoints for initial forward //and reverse path navigation. m_vecForward[1] = 0; int N = A.Length; int M = B.Length; int iDelta = N-M; m_vecReverse[iDelta + 1] = N + 1; return true; } else return false; } private int GetForwardDPaths(SubArray A, SubArray B, int D, int k) { DiagonalVector V = m_vecForward; int x; if ((k == -D) || (k != D && V[k-1] < V[k+1])) { x = V[k+1]; } else { x = V[k-1]+1; } int y = x - k; while (x < A.Length && y < B.Length && A[x+1].CompareTo(B[y+1]) == 0) { x++; y++; } V[k] = x; return x; } private int GetReverseDPaths(SubArray A, SubArray B, int D, int k, int iDelta) { DiagonalVector V = m_vecReverse; int p = k + iDelta; int x; if ((k == -D) || (k != D && V[p+1] <= V[p-1])) { x = V[p+1]-1; } else { x = V[p-1]; } int y = x - p; while (x > 0 && y > 0 && A[x].CompareTo(B[y]) == 0) { x--; y--; } V[p] = x; return x; } private int FindMiddleSnake(SubArray A, SubArray B, out int iPathStartX, out int iPathEndX, out int iPathK) { //We don't have to check the result of this because the calling procedure //has already check the length preconditions. SetupFictitiousPoints(A, B); iPathStartX = -1; iPathEndX = -1; iPathK = 0; int iDelta = A.Length - B.Length; int iCeiling = (int)Math.Ceiling((A.Length + B.Length) / 2.0); for (int D = 0; D <= iCeiling; D++) { for (int k = -D; k <= D; k+=2) { //Find the end of the furthest reaching forward D-path in diagonal k. GetForwardDPaths(A, B, D, k); //If iDelta is odd (i.e. remainder == 1 or -1) and ... if ((iDelta % 2 != 0) && (k >= (iDelta-(D-1)) && k <= (iDelta+(D-1)))) { //If the path overlaps the furthest reaching reverse (D-1)-path in diagonal k. if (m_vecForward[k] >= m_vecReverse[k]) { //The last snake of the forward path is the middle snake. iPathK = k; iPathEndX = m_vecForward[k]; iPathStartX = iPathEndX; int iPathStartY = iPathStartX - iPathK; while (iPathStartX > 0 && iPathStartY > 0 && A[iPathStartX].CompareTo(B[iPathStartY]) == 0) { iPathStartX--; iPathStartY--; } //Length of an SES is 2D-1. return 2*D-1; } } } for (int k = -D; k <= D; k+=2) { //Find the end of the furthest reaching reverse D=path in diagonal k+iDelta GetReverseDPaths(A, B, D, k, iDelta); //If iDelta is even and ... if ((iDelta % 2 == 0) && ((k+iDelta) >= -D && (k+iDelta) <= D)) { //If the path overlaps the furthest reaching forward D-path in diagonal k+iDelta. if (m_vecReverse[k+iDelta] <= m_vecForward[k+iDelta]) { //The last snake of the reverse path is the middle snake. iPathK = k+iDelta; iPathStartX = m_vecReverse[iPathK]; iPathEndX = iPathStartX; int iPathEndY = iPathEndX - iPathK; while(iPathEndX < A.Length && iPathEndY < B.Length && A[iPathEndX+1].CompareTo(B[iPathEndY+1]) == 0) { iPathEndX++; iPathEndY++; } //Length of an SES is 2D. return 2*D; } } } } //We should never get here if the algorithm is coded correctly. Debug.Assert(false); return -1; } private void GetLCS(SubArray A, SubArray B, List Output) { if (A.Length > 0 && B.Length > 0) { //Find the length D and the middle snake from (x,y) to (u,v) int x, u, k; int D = FindMiddleSnake(A, B, out x, out u, out k); int y = x - k; int v = u - k; if (D > 1) { GetLCS(new SubArray(A, 1, x), new SubArray(B, 1, y), Output); for (int i = x+1; i <= u; i++) { Output.Add(A[i]); } GetLCS(new SubArray(A, u + 1, A.Length - u), new SubArray(B, v + 1, B.Length - v), Output); } else if (B.Length > A.Length) { for (int i = 1; i <= A.Length; i++) { Output.Add(A[i]); } } else { for (int i = 1; i <= B.Length; i++) { Output.Add(B[i]); } } } } private void GetMatchPoints(SubArray A, SubArray B, List MatchPoints) { if (A.Length > 0 && B.Length > 0) { //Find the middle snake from (x,y) to (u,v) int x, u, k; int D = FindMiddleSnake(A, B, out x, out u, out k); int y = x - k; int v = u - k; if (D > 1) { GetMatchPoints(new SubArray(A, 1, x), new SubArray(B, 1, y), MatchPoints); for (int i = x+1; i <= u; i++) { //Output absolute X and Y (not relative to the current subarray) MatchPoints.Add(new Point(i + A.Offset, i-k + B.Offset)); } GetMatchPoints(new SubArray(A, u + 1, A.Length - u), new SubArray(B, v + 1, B.Length - v), MatchPoints); } else { //If there are no differences, we have to output all of the points. //If there's only one difference, we have to output all of the //match points, skipping the single point that is different. Debug.Assert(D == 0 || Math.Abs(A.Length-B.Length) == 1, "A and B's lengths must differ by 1 if D == 1"); //Only go to the minimum of the two lengths since that's the //most that can possibly match between the two subsequences. int N = A.Length; int M = B.Length; if (M > N) { //Output A[1..N] as match points int iCurrY = 1; for (int i = 1; i <= N; i++) { //We must skip the one difference when we hit it if (A[i].CompareTo(B[iCurrY]) != 0) { iCurrY++; } MatchPoints.Add(new Point(i + A.Offset, iCurrY + B.Offset)); iCurrY++; } } else { //Output B[1..M] as match points int iCurrX = 1; for (int i = 1; i <= M; i++) { //We must skip the one difference when we hit it if (A[iCurrX].CompareTo(B[i]) != 0) { iCurrX++; } MatchPoints.Add(new Point(iCurrX + A.Offset, i + B.Offset)); iCurrX++; } } } } } private EditScript ConvertMatchPointsToEditScript(int N, int M, List MatchPoints) { EditScript Script = new EditScript(); int iCurrX = 1; int iCurrY = 1; //Add a fictitious match point at (N+1, M+1) so we're guaranteed to //pick up all edits with a single loop. MatchPoints.Add(new Point(N+1, M+1)); //NOTE: When we create new Edit instances, we'll store iCurrX and iCurrY //minus 1 because we want to convert them back to 0-based indexes for //the user. The user shouldn't have to know that internally we use any //1-based types. foreach (Point Pt in MatchPoints) { int iMatchX = Pt.X; int iMatchY = Pt.Y; //A one-to-one grouping of inserts and deletes will be considered a change. if (m_bSupportChangeEditType && iCurrX < iMatchX && iCurrY < iMatchY) { int iChangeLength = Math.Min(iMatchX-iCurrX, iMatchY-iCurrY); Script.Add(new Edit(EditType.Change, iCurrX - 1, iCurrY - 1, iChangeLength)); iCurrX += iChangeLength; iCurrY += iChangeLength; } if (iCurrX < iMatchX) { Script.Add(new Edit(EditType.Delete, iCurrX-1, iCurrY-1, iMatchX - iCurrX)); } if (iCurrY < iMatchY) { Script.Add(new Edit(EditType.Insert, iCurrX-1, iCurrY-1, iMatchY - iCurrY)); } iCurrX = iMatchX + 1; iCurrY = iMatchY + 1; } return Script; } #endregion #region Private Member Variables private T[] m_arA; //Sequence A private T[] m_arB; //Sequence B private DiagonalVector m_vecForward; private DiagonalVector m_vecReverse; private bool m_bSupportChangeEditType; #endregion } }