Initial revision

2003-07-14 12:22:29 +00:00 · 2003-07-14 12:22:29 +00:00 · 06fb77a82b
commit 06fb77a82b
parent f900154e7a
19 changed files with 3139 additions and 0 deletions
--- a/source/org/apache/commons/jrcs/diff/SimpleDiff.java
+++ b/source/org/apache/commons/jrcs/diff/SimpleDiff.java
@ -0,0 +1,315 @@
+/*
+ * ====================================================================
+ *
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 1999-2003 The Apache Software Foundation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution, if
+ *    any, must include the following acknowlegement:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowlegement may appear in the software itself,
+ *    if and wherever such third-party acknowlegements normally appear.
+ *
+ * 4. The names "The Jakarta Project", "Commons", and "Apache Software
+ *    Foundation" must not be used to endorse or promote products derived
+ *    from this software without prior written permission. For written
+ *    permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache"
+ *    nor may "Apache" appear in their names without prior written
+ *    permission of the Apache Group.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ *
+ */
+
+package org.apache.commons.jrcs.diff;
+
+import java.util.*;
+
+/**
+ * Implements a simple differencing algortithm.<p>
+ *
+ * @date $Date: 2003/07/14 12:22:29 $
+ * @version $Revision: 1.1 $
+ * @author <a href="mailto:juanco@suigeneris.org">Juanco Anez</a>
+ *
+ * <p><b>Overview of Algorithm</b></p>
+ *
+ * <p><i>by <a
+ * href='http://www.topmeadow.net/bwm'> bwm</a>
+ * </p>
+ *
+ * <p>The algorithm is optimised for situations where the input sequences
+ * have few repeated objects.  If it is given input with many repeated
+ * objects it will report sub-optimal changes.  However, given appropriate
+ * input, it is fast, and linear in memory usage.</p>
+ *
+ * <p>The algorithm consists of the following steps:</p>
+ * <ul>
+ *   <li>compute an equivalence set for the input data</li>
+ *   <li>translate each element of the orginal
+ *       and revised input sequences to a member of the equivalence set
+ *   </li>
+ *   <li>match the the input sequences to determine the deltas, i.e.
+ *       the differences between the original and revised sequences.</li>
+ * </ul>
+ *
+ * <p>The first step is to compute a an equivalence set for the input data.
+ *  The equivalence set is computed from objects that are in the original
+ *  input sequence</p>
+ * <pre>
+ *   eq(x) = the index of the first occurence of x in the original sequence.
+ * </pre>
+ *
+ * <p>With this equivalence function, the algorithm can compare integers rather
+ * than strings, which is considerably more efficient.</p>
+ *
+ * <p>The second step is to compute the datastructure on which the
+ * algorithm will operate.  Having computed the equivalence function
+ * in the previous step, we can compute two arrays where
+ * indx[i] = eqs(orig[i]) and jndx[i] = eqs(rev[i]).  The algorithm can
+ * now operate on indx and jndx instead of orig and rev. Thus, comparisons
+ * are then on O(int == int) instead of O(Object.equals(Object)).
+ * </p>
+ *
+ * <p>The algorithm now matches indx and jndx.  Whilst indx[i] == jndx[i]
+ * it skips matching objects in the sequence.  In seeking to match objects
+ * in the input sequence it assumes that each object is likely to be unique.
+ * It uses the known characteristics of the unique equivalence function.  It can
+ * tell from the eq value if this object appeared in the other sequence
+ * at all.  If it did not, there is no point in searching for a match.</p>
+ *
+ * <p>Recall that the eq function value is the index earliest occurrence in
+ * the orig sequence.  This information is used to search efficiently for
+ * the next match.  The algorithm is perfect when all input objects are
+ * unique, but degrades when input objects are not unique.  When input
+ * objects are not unique an optimal match may not be found, but a
+ * correct match will be.</p>
+ *
+ * <p>Having identified common matching objects in the orig and revised
+ * sequences, the differences between them are easily computed.
+ * </p>
+ *
+ * @see Delta
+ * @see Revision
+ * Modifications:
+ *
+ * 27/Apr/2003 bwm
+ * Added some comments whilst trying to figure out the algorithm
+ *
+ * 03 May 2003 bwm
+ * Created this implementation class by refactoring it out of the Diff
+ * class to enable plug in difference algorithms
+ *
+ */
+public class SimpleDiff
+    implements DiffAlgorithm
+{
+
+    static final int NOT_FOUND_i = -2;
+    static final int NOT_FOUND_j = -1;
+    static final int EOS = Integer.MAX_VALUE;
+
+    public SimpleDiff()
+    {
+    }
+
+    protected int scan(int[] ndx, int i, int target)
+    {
+        while (ndx[i] < target)
+        {
+            i++;
+        }
+        return i;
+    }
+
+    /**
+     * Compute the difference between original and revised sequences.
+     *
+     * @param orig The original sequence.
+     * @param rev The revised sequence to be compared with the original.
+     * @return A Revision object describing the differences.
+     * @throws DifferenciationFailedException if the diff could not be computed.
+     */
+    public Revision diff(Object[] orig, Object[] rev)
+        throws DifferentiationFailedException
+    {
+        // create map eqs, such that for each item in both orig and rev
+        // eqs(item) = firstOccurrence(item, orig);
+        Map eqs = buildEqSet(orig, rev);
+
+        // create an array such that
+        //   indx[i] = NOT_FOUND_i if orig[i] is not in rev
+        //   indx[i] = firstOccurrence(orig[i], orig)
+        int[] indx = buildIndex(eqs, orig, NOT_FOUND_i);
+
+        // create an array such that
+        //   jndx[j] = NOT_FOUND_j if orig[j] is not in rev
+        //   jndx[j] = firstOccurrence(rev[j], orig)
+        int[] jndx = buildIndex(eqs, rev, NOT_FOUND_j);
+
+        // what in effect has been done is to build a unique hash
+        // for each item that is in both orig and rev
+        // and to label each item in orig and new with that hash value
+        // or a marker that the item is not common to both.
+
+        eqs = null; // let gc know we're done with this
+
+        Revision deltas = new Revision(); //!!! new Revision()
+        int i = 0;
+        int j = 0;
+
+        // skip matching
+        // skip leading items that are equal
+        // could be written
+        // for (i=0; indx[i] != EOS && indx[i] == jndx[i]; i++);
+        // j = i;
+        for (; indx[i] != EOS && indx[i] == jndx[j]; i++, j++)
+        {
+            /* void */
+        }
+
+        while (indx[i] != jndx[j])
+        { // only equal if both == EOS
+            // they are different
+            int ia = i;
+            int ja = j;
+
+            // size of this delta
+            do
+            {
+                // look down rev for a match
+                // stop at a match
+                // or if the FO(rev[j]) > FO(orig[i])
+                // or at the end
+                while (jndx[j] < 0 || jndx[j] < indx[i])
+                {
+                    j++;
+                }
+                // look down orig for a match
+                // stop at a match
+                // or if the FO(orig[i]) > FO(rev[j])
+                // or at the end
+                while (indx[i] < 0 || indx[i] < jndx[j])
+                {
+                    i++;
+                }
+
+                // this doesn't do a compare each line with each other line
+                // so it won't find all matching lines
+            }
+            while (indx[i] != jndx[j]);
+
+            // on exit we have a match
+
+            // they are equal, reverse any exedent matches
+            // it is possible to overshoot, so count back matching items
+            while (i > ia && j > ja && indx[i - 1] == jndx[j - 1])
+            {
+                --i;
+                --j;
+            }
+
+            deltas.addDelta(Delta.newDelta(new Chunk(orig, ia, i - ia),
+                                           new Chunk(rev, ja, j - ja)));
+            // skip matching
+            for (; indx[i] != EOS && indx[i] == jndx[j]; i++, j++)
+            {
+                /* void */
+            }
+        }
+        return deltas;
+    }
+
+    /**
+     * create a <code>Map</code> from each common item in orig and rev
+     * to the index of its first occurrence in orig
+     *
+     * @param orig the original sequence of items
+     * @param rev  the revised sequence of items
+     */
+    protected Map buildEqSet(Object[] orig, Object[] rev)
+    {
+        // construct a set of the objects that orig and rev have in common
+
+        // first construct a set containing all the elements in orig
+        Set items = new HashSet(Arrays.asList(orig));
+
+        // then remove all those not in rev
+        items.retainAll(Arrays.asList(rev));
+
+        Map eqs = new HashMap();
+        for (int i = 0; i < orig.length; i++)
+        {
+            // if its a common item and hasn't been found before
+            if (items.contains(orig[i]))
+            {
+                // add it to the map
+                eqs.put(orig[i], new Integer(i));
+                // and make sure its not considered again
+                items.remove(orig[i]);
+            }
+        }
+        return eqs;
+    }
+
+    /**
+     * build a an array such each a[i] = eqs([i]) or NF if eqs([i]) undefined
+     *
+     * @param eqs a mapping from Object to Integer
+     * @param seq a sequence of objects
+     * @param NF  the not found marker
+     */
+    protected int[] buildIndex(Map eqs, Object[] seq, int NF)
+    {
+        int[] result = new int[seq.length + 1];
+        for (int i = 0; i < seq.length; i++)
+        {
+            Integer value = (Integer) eqs.get(seq[i]);
+            if (value == null || value.intValue() < 0)
+            {
+                result[i] = NF;
+            }
+            else
+            {
+                result[i] = value.intValue();
+            }
+        }
+        result[seq.length] = EOS;
+        return result;
+    }
+
+}