316 lines
11 KiB
Java
316 lines
11 KiB
Java
|
/*
|
||
|
* ====================================================================
|
||
|
*
|
||
|
* The Apache Software License, Version 1.1
|
||
|
*
|
||
|
* Copyright (c) 1999-2003 The Apache Software Foundation.
|
||
|
* All rights reserved.
|
||
|
*
|
||
|
* Redistribution and use in source and binary forms, with or without
|
||
|
* modification, are permitted provided that the following conditions
|
||
|
* are met:
|
||
|
*
|
||
|
* 1. Redistributions of source code must retain the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer.
|
||
|
*
|
||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer in
|
||
|
* the documentation and/or other materials provided with the
|
||
|
* distribution.
|
||
|
*
|
||
|
* 3. The end-user documentation included with the redistribution, if
|
||
|
* any, must include the following acknowlegement:
|
||
|
* "This product includes software developed by the
|
||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||
|
* Alternately, this acknowlegement may appear in the software itself,
|
||
|
* if and wherever such third-party acknowlegements normally appear.
|
||
|
*
|
||
|
* 4. The names "The Jakarta Project", "Commons", and "Apache Software
|
||
|
* Foundation" must not be used to endorse or promote products derived
|
||
|
* from this software without prior written permission. For written
|
||
|
* permission, please contact apache@apache.org.
|
||
|
*
|
||
|
* 5. Products derived from this software may not be called "Apache"
|
||
|
* nor may "Apache" appear in their names without prior written
|
||
|
* permission of the Apache Group.
|
||
|
*
|
||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||
|
* SUCH DAMAGE.
|
||
|
* ====================================================================
|
||
|
*
|
||
|
* This software consists of voluntary contributions made by many
|
||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||
|
* information on the Apache Software Foundation, please see
|
||
|
* <http://www.apache.org/>.
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
package org.apache.commons.jrcs.diff;
|
||
|
|
||
|
import java.util.*;
|
||
|
|
||
|
/**
|
||
|
* Implements a simple differencing algortithm.<p>
|
||
|
*
|
||
|
* @date $Date: 2003/07/14 12:22:29 $
|
||
|
* @version $Revision: 1.1 $
|
||
|
* @author <a href="mailto:juanco@suigeneris.org">Juanco Anez</a>
|
||
|
*
|
||
|
* <p><b>Overview of Algorithm</b></p>
|
||
|
*
|
||
|
* <p><i>by <a
|
||
|
* href='http://www.topmeadow.net/bwm'> bwm</a>
|
||
|
* </p>
|
||
|
*
|
||
|
* <p>The algorithm is optimised for situations where the input sequences
|
||
|
* have few repeated objects. If it is given input with many repeated
|
||
|
* objects it will report sub-optimal changes. However, given appropriate
|
||
|
* input, it is fast, and linear in memory usage.</p>
|
||
|
*
|
||
|
* <p>The algorithm consists of the following steps:</p>
|
||
|
* <ul>
|
||
|
* <li>compute an equivalence set for the input data</li>
|
||
|
* <li>translate each element of the orginal
|
||
|
* and revised input sequences to a member of the equivalence set
|
||
|
* </li>
|
||
|
* <li>match the the input sequences to determine the deltas, i.e.
|
||
|
* the differences between the original and revised sequences.</li>
|
||
|
* </ul>
|
||
|
*
|
||
|
* <p>The first step is to compute a an equivalence set for the input data.
|
||
|
* The equivalence set is computed from objects that are in the original
|
||
|
* input sequence</p>
|
||
|
* <pre>
|
||
|
* eq(x) = the index of the first occurence of x in the original sequence.
|
||
|
* </pre>
|
||
|
*
|
||
|
* <p>With this equivalence function, the algorithm can compare integers rather
|
||
|
* than strings, which is considerably more efficient.</p>
|
||
|
*
|
||
|
* <p>The second step is to compute the datastructure on which the
|
||
|
* algorithm will operate. Having computed the equivalence function
|
||
|
* in the previous step, we can compute two arrays where
|
||
|
* indx[i] = eqs(orig[i]) and jndx[i] = eqs(rev[i]). The algorithm can
|
||
|
* now operate on indx and jndx instead of orig and rev. Thus, comparisons
|
||
|
* are then on O(int == int) instead of O(Object.equals(Object)).
|
||
|
* </p>
|
||
|
*
|
||
|
* <p>The algorithm now matches indx and jndx. Whilst indx[i] == jndx[i]
|
||
|
* it skips matching objects in the sequence. In seeking to match objects
|
||
|
* in the input sequence it assumes that each object is likely to be unique.
|
||
|
* It uses the known characteristics of the unique equivalence function. It can
|
||
|
* tell from the eq value if this object appeared in the other sequence
|
||
|
* at all. If it did not, there is no point in searching for a match.</p>
|
||
|
*
|
||
|
* <p>Recall that the eq function value is the index earliest occurrence in
|
||
|
* the orig sequence. This information is used to search efficiently for
|
||
|
* the next match. The algorithm is perfect when all input objects are
|
||
|
* unique, but degrades when input objects are not unique. When input
|
||
|
* objects are not unique an optimal match may not be found, but a
|
||
|
* correct match will be.</p>
|
||
|
*
|
||
|
* <p>Having identified common matching objects in the orig and revised
|
||
|
* sequences, the differences between them are easily computed.
|
||
|
* </p>
|
||
|
*
|
||
|
* @see Delta
|
||
|
* @see Revision
|
||
|
* Modifications:
|
||
|
*
|
||
|
* 27/Apr/2003 bwm
|
||
|
* Added some comments whilst trying to figure out the algorithm
|
||
|
*
|
||
|
* 03 May 2003 bwm
|
||
|
* Created this implementation class by refactoring it out of the Diff
|
||
|
* class to enable plug in difference algorithms
|
||
|
*
|
||
|
*/
|
||
|
public class SimpleDiff
|
||
|
implements DiffAlgorithm
|
||
|
{
|
||
|
|
||
|
static final int NOT_FOUND_i = -2;
|
||
|
static final int NOT_FOUND_j = -1;
|
||
|
static final int EOS = Integer.MAX_VALUE;
|
||
|
|
||
|
public SimpleDiff()
|
||
|
{
|
||
|
}
|
||
|
|
||
|
protected int scan(int[] ndx, int i, int target)
|
||
|
{
|
||
|
while (ndx[i] < target)
|
||
|
{
|
||
|
i++;
|
||
|
}
|
||
|
return i;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Compute the difference between original and revised sequences.
|
||
|
*
|
||
|
* @param orig The original sequence.
|
||
|
* @param rev The revised sequence to be compared with the original.
|
||
|
* @return A Revision object describing the differences.
|
||
|
* @throws DifferenciationFailedException if the diff could not be computed.
|
||
|
*/
|
||
|
public Revision diff(Object[] orig, Object[] rev)
|
||
|
throws DifferentiationFailedException
|
||
|
{
|
||
|
// create map eqs, such that for each item in both orig and rev
|
||
|
// eqs(item) = firstOccurrence(item, orig);
|
||
|
Map eqs = buildEqSet(orig, rev);
|
||
|
|
||
|
// create an array such that
|
||
|
// indx[i] = NOT_FOUND_i if orig[i] is not in rev
|
||
|
// indx[i] = firstOccurrence(orig[i], orig)
|
||
|
int[] indx = buildIndex(eqs, orig, NOT_FOUND_i);
|
||
|
|
||
|
// create an array such that
|
||
|
// jndx[j] = NOT_FOUND_j if orig[j] is not in rev
|
||
|
// jndx[j] = firstOccurrence(rev[j], orig)
|
||
|
int[] jndx = buildIndex(eqs, rev, NOT_FOUND_j);
|
||
|
|
||
|
// what in effect has been done is to build a unique hash
|
||
|
// for each item that is in both orig and rev
|
||
|
// and to label each item in orig and new with that hash value
|
||
|
// or a marker that the item is not common to both.
|
||
|
|
||
|
eqs = null; // let gc know we're done with this
|
||
|
|
||
|
Revision deltas = new Revision(); //!!! new Revision()
|
||
|
int i = 0;
|
||
|
int j = 0;
|
||
|
|
||
|
// skip matching
|
||
|
// skip leading items that are equal
|
||
|
// could be written
|
||
|
// for (i=0; indx[i] != EOS && indx[i] == jndx[i]; i++);
|
||
|
// j = i;
|
||
|
for (; indx[i] != EOS && indx[i] == jndx[j]; i++, j++)
|
||
|
{
|
||
|
/* void */
|
||
|
}
|
||
|
|
||
|
while (indx[i] != jndx[j])
|
||
|
{ // only equal if both == EOS
|
||
|
// they are different
|
||
|
int ia = i;
|
||
|
int ja = j;
|
||
|
|
||
|
// size of this delta
|
||
|
do
|
||
|
{
|
||
|
// look down rev for a match
|
||
|
// stop at a match
|
||
|
// or if the FO(rev[j]) > FO(orig[i])
|
||
|
// or at the end
|
||
|
while (jndx[j] < 0 || jndx[j] < indx[i])
|
||
|
{
|
||
|
j++;
|
||
|
}
|
||
|
// look down orig for a match
|
||
|
// stop at a match
|
||
|
// or if the FO(orig[i]) > FO(rev[j])
|
||
|
// or at the end
|
||
|
while (indx[i] < 0 || indx[i] < jndx[j])
|
||
|
{
|
||
|
i++;
|
||
|
}
|
||
|
|
||
|
// this doesn't do a compare each line with each other line
|
||
|
// so it won't find all matching lines
|
||
|
}
|
||
|
while (indx[i] != jndx[j]);
|
||
|
|
||
|
// on exit we have a match
|
||
|
|
||
|
// they are equal, reverse any exedent matches
|
||
|
// it is possible to overshoot, so count back matching items
|
||
|
while (i > ia && j > ja && indx[i - 1] == jndx[j - 1])
|
||
|
{
|
||
|
--i;
|
||
|
--j;
|
||
|
}
|
||
|
|
||
|
deltas.addDelta(Delta.newDelta(new Chunk(orig, ia, i - ia),
|
||
|
new Chunk(rev, ja, j - ja)));
|
||
|
// skip matching
|
||
|
for (; indx[i] != EOS && indx[i] == jndx[j]; i++, j++)
|
||
|
{
|
||
|
/* void */
|
||
|
}
|
||
|
}
|
||
|
return deltas;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* create a <code>Map</code> from each common item in orig and rev
|
||
|
* to the index of its first occurrence in orig
|
||
|
*
|
||
|
* @param orig the original sequence of items
|
||
|
* @param rev the revised sequence of items
|
||
|
*/
|
||
|
protected Map buildEqSet(Object[] orig, Object[] rev)
|
||
|
{
|
||
|
// construct a set of the objects that orig and rev have in common
|
||
|
|
||
|
// first construct a set containing all the elements in orig
|
||
|
Set items = new HashSet(Arrays.asList(orig));
|
||
|
|
||
|
// then remove all those not in rev
|
||
|
items.retainAll(Arrays.asList(rev));
|
||
|
|
||
|
Map eqs = new HashMap();
|
||
|
for (int i = 0; i < orig.length; i++)
|
||
|
{
|
||
|
// if its a common item and hasn't been found before
|
||
|
if (items.contains(orig[i]))
|
||
|
{
|
||
|
// add it to the map
|
||
|
eqs.put(orig[i], new Integer(i));
|
||
|
// and make sure its not considered again
|
||
|
items.remove(orig[i]);
|
||
|
}
|
||
|
}
|
||
|
return eqs;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* build a an array such each a[i] = eqs([i]) or NF if eqs([i]) undefined
|
||
|
*
|
||
|
* @param eqs a mapping from Object to Integer
|
||
|
* @param seq a sequence of objects
|
||
|
* @param NF the not found marker
|
||
|
*/
|
||
|
protected int[] buildIndex(Map eqs, Object[] seq, int NF)
|
||
|
{
|
||
|
int[] result = new int[seq.length + 1];
|
||
|
for (int i = 0; i < seq.length; i++)
|
||
|
{
|
||
|
Integer value = (Integer) eqs.get(seq[i]);
|
||
|
if (value == null || value.intValue() < 0)
|
||
|
{
|
||
|
result[i] = NF;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
result[i] = value.intValue();
|
||
|
}
|
||
|
}
|
||
|
result[seq.length] = EOS;
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
}
|