Source: suffixtree.js

/**  
 * @brief Module for creating a suffix tree out of an input text and 
 * getting important features from it for creating an arc diagram
 *
 * @file suffixtree.cpp
 * @author David Pfahler
 * @author Matthias Gusenbauer
 *
 * @long Module for creating a suffix tree out of an input text and 
 * getting important features from it for creating an arc diagram
 *
 * suffix tree adapted but enhanced from:
 * Copyright (C) 2012 Eike Send 
 * 
 * http://eike.se/nd
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
 * DEALINGS IN THE SOFTWARE.
 */


/** Node object constructor for the suffixtree construction
 * @constructor
 */
Node = function(){
  this.value = ""; /** Node string value */
  this.leaves = []; /** Leaves(suffixes) of the node */
  this.nodes = []; /** Childnodes of a node */
  this.leavePositions = []; /** Startpositions of the leaves in the string */
  this.nodePositions = []; /** Startpositions of the childnodes */
};

/** Arc object constructor for maximum matchin pair generation
 * @constructor
 * @param sourcePos First starting position of the arc
 * @param numberEle Length of the arc in characters
 * @param targetPos Second starting position of the arc
 * @param text String value of the arc
 * @return Instance of an Arc object
 */
Arc = function(sourcePos,numberEle,targetPos,text) {
  this.sourcePos = sourcePos; /** First starting position of the arc */
  this.numberEle = numberEle; /** Length of the arc in characters */
  this.targetPos = targetPos; /** Second starting position of the arc */
  this.text      = text; /** String value of the arc */
};

/** Pair object constructor for pairs in the tree
 * @constructor
 * @param str the value of pair
 * @param pos the starting positions in text
 */
Pair = function(str, pos){
  this.pattern = new String(str); /** value of pair */
  this.positions = pos; /**  starting positions in text */
  this.length = str.length; /**  length of the text */
};

/** Method to update the position of the pattern of a pair with a new letter 
 * @param letter the letter that gets concated to the pattern
 */
Pair.prototype.update = function(letter){
    this.pattern = this.pattern.concat(letter);
    this.length = this.pattern.length;
    // meant for multiple pairs per node
    for (var i = 0; i < this.positions.length; i++) {
        this.positions[i]--;
    }
};

/** Method to test for node splitting in Suffixtree generation
 * @param suf Current suffix in the tree generation
 * @param pos Position of the current suffx in the complete string
 * @return boolean whether to split a node or not
 */
Node.prototype.checkNodes = function(suf, pos) {
  var node;
  for (var i = 0; i < this.nodes.length; i++) {
    node = this.nodes[i];
    if (node.value == suf[0]) { // if first letter in current node and suffix matches
      node.addSuffix(suf.slice(1), pos+1); // split and take new suffix - begin again - MAGIC: node creation here - save position of string
      node.nodePositions.push(pos);
      return true; // has updated
    }
  }
  return false;
};

/** Method to test leaves for matching beginning and node insertion
 * @param suf Current suffix in the tree generation
 * @param pos Position of the current suffx in the complete string
 * @return none
 */
Node.prototype.checkLeaves = function(suf, pos) {
  var node, leaf;
  for (var i = 0; i < this.leaves.length; i++) {
    leaf = this.leaves[i];
    if (leaf[0] == suf[0]) { // if first letter matches leave
      node = new Node(); // create new (inner) node - MAGIC: node creation here - save position of string
      node.value = leaf[0];
      node.addSuffix(suf.slice(1), pos+1); // add suffix of string
      var oldPos = this.leavePositions.splice(i, 1)[0];
      node.addSuffix(leaf.slice(1), oldPos+1); // add suffix of previous leave
      //node.nodePositions.push(pos-1);
      node.nodePositions.push(oldPos);
      node.nodePositions.push(pos);
      this.nodes.push(node); // add new node to current node
      this.leaves.splice(i,1);
      //this.nodePositions.push(this.leavePositions.splice(i, 1));
      return;
    }
  }
  this.leaves.push(suf); // if no suffix match - add new leave node to current node
  this.leavePositions.push(pos);
};

/** Method to add a suffix to the current tree
 * @param suf Current suffix in the tree generation
 * @param pos Position of the current suffx in the complete string
 * @return none
 */
Node.prototype.addSuffix = function(suf, pos) {
  if (!suf.length) return; // last suffix
  if (!this.checkNodes(suf, pos)) { // inner: check all nodes if contains suffix
    this.checkLeaves(suf, pos); // outer: if no update in nodes -> check all leavees if contains suffix
  }
};

/** Method to retrieve the longest repeated substring from the suffixtree
 * @return The longest of the repeated substrings in the suffixtree
 */
Node.prototype.getLongestRepeatedSubString = function() {
  var str = "";
  var temp = "";
  for (var i = 0; i < this.nodes.length; i++) {
    temp = this.nodes[i].getLongestRepeatedSubString();
    if (temp.length > str.length) {
      str = temp;
    }
  }
  return this.value + str;
};

/** Method to retrieve the longest repeated substrings from the suffixtree
 * @return All pairs of repeated substrings
 */
Node.prototype.getLongestRepeatedSubStrings = function() {
    var pairs = [];
    var temp = [];
    
    // Add pair for each position of node
    /*for (var i = 0; i < this.nodePositions.length; i++) {
        pairs.push(new Pair(this.value, this.nodePositions));
    }*/
    if (this.nodePositions.length > 0) {
        pairs.push(new Pair(this.value, this.nodePositions));
    }
    
    // Iterate over subnodes
    for (var i = 0; i < this.nodes.length; i++) {
        temp = this.nodes[i].getLongestRepeatedSubStrings();
        
        for (var j = 0; j < temp.length; j++) {
            
            if (this.value.length !== 0) { //Not only length check but also check for Definition 1. 
                temp[j].update(this.value);
            }
            pairs.push(temp[j]);
        }
    }
    
    return pairs;
};

/** Method to create an HTML representation of the suffix tree
 * @return HTML representation of the suffixtree
 */
Node.prototype.toHTML = function() {
  var html = "<div class=node>";
  if (this.value.length) {
    html += "<h3>"+this.value+"</h3>";
  }
  if (this.nodes.length) {
    html += "<h4>nodes</h4><ul>";
    for (var i = 0; i < this.nodes.length; i++) {
      html += "<li>" + this.nodes[i].toHTML() + "</li>";
    }
    html += "</ul>";
  }
  if (this.leaves.length) {
    html += "<h4>leaves</h4><ul>";
    for (var i = 0; i < this.leaves.length; i++) {
      html += "<li>" + this.leaves[i] + "</li>";
    }
    html += "</ul>";
  }
  return html;
};

/** Method to create an HTML representation of the suffix tree
 * @constructor
 * @return Suffixtree object
 */
SuffixTree = function(str) {
  this.node = new Node(); /** Root node of suffixtree */
  for (var i = 0; i < str.length; i++) {
    this.node.addSuffix(str.slice(i), i);
  }
};

/** Method to retrieve all repetitionregions of a tree according to definition 2. from the paper
 * @param input The input text string to parse for repetition regions
 * @return All repetition regions in the suffixtree
 */
SuffixTree.prototype.getRepRegion = function(input) {
    var regions = [];
      var n = input.length;
      var s = 0;
      while (s < n-1)
      {
        for (var l = 1; l < (n-s)/2+1;l++)
        {
          var candidate = input.substring(s,s+l);
          var end = s+l;
          while (input.substring(end,end+l).indexOf(candidate) >= 0)
            end += l;

          // not enough pairs
          if(end === s + l)
            continue;

          // not left most
          var bOk = false;
          tuple = [s-1,end-1,candidate.length]
          for(var i = 0; i < regions.length && !bOk; i++)
          {
            var r = regions[i];
            if(r[0] === t[0] && r[1] === t[1] && r[2].length === t[2])
              bOk = true;
          }
          if(bOk)
            continue;

          // not minimal
          bOk = false;
          for(var i = 0; i < regions.length && !bOk; i++)
          {
            r = regions[i];
            if(r[0] <= s && r[1] >= end && r[2].length <= candidate.length)
              bOk = true;
          }
          if(bOk)
            continue;

          // perfekt
          r = [s,end,candidate];
          regions.push(r);
        }
        s++;
      }
    
    return regions;
}

/** Method to retrieve all essential matching pairs of a tree according to definition 3. from the paper
 * @param mmPairs The maximal matching pairs of the tree 
 * @param regions The repetition regions of the tree
 * @return All essential matching pairs from the suffix tree
 */
SuffixTree.prototype.getEssentialMatchinPairs = function(mmPairs, regions) {
    var emPairs = [];

      for (var i = 0; i < mmPairs.length; i++) {
        var a = mmPairs[i];
          // definition 3.1 
          // A maximal matching pair not contained in any repetition region
          bOk = true;
          for (var j = 0; j < regions.length && bOk; j++) {
            r = regions[j];
            if ((a.sourcePos >= r[0]) && 
                (a.targetPos + a.numberEle <= r[1]) ||
                 // definition 3.2
                 // Or, a maximal matching pair contained in the same fundamental substring of any repetition region that contains it
                (Math.floor((a.sourcePos - r[0])/r[2].length) == Math.floor((a.targetPos + a.numberEle - r[0] - 1)/r[2].length)))
              bOk = false;
          }
          if(bOk)
            emPairs.push(a);
      }
      // definition 3.3
      // Or, two consecutive fundamental substrings for a repetition region.
      for (var i = regions.length - 1; i >= 0; i--) {
        var r = regions[i];
        var text = r[2];
        var step = text.length;
        stop = r[1] - step;
        for (var j=r[0]; j < stop; j += step){
            emPairs.push(new Arc(j,step,j+step,text));
        }
      }
    
    return emPairs;
}

/** Method to retrieve all maximal matching pairs of a tree according to definition 1. from the paper
 * @return All maximal matching pairs from the tree
 */
SuffixTree.prototype.getMaximalMatchingPairs = function() {

    // Definition 1.1 Identical.X and Y consist of the same sequence of symbols.
    var pairs = this.node.getLongestRepeatedSubStrings();

    // sort the pairs by their string length
    pairs.sort(function(a,b){
      if( a.length < b.length) return 1;
      else if (a.length > b.length) return -1;
      else if (a.positions.length < b.positions.length) return 1;
      else if (a.positions.length > b.positions.length) return -1; 
      else return 0; }
      );
    // and the positions by their occurences
    for (i = 0; i < pairs.length; i++) {
      pairs[i].positions.sort(function(a,b){return (a - b)});
    }    

    //  Definition 1.2 Non-overlapping X and Y do not intersect.
    for (i = 0; i < pairs.length; i++) {
      var pair = pairs[i];
      // eliminate all overlapping within a pattern
      for(s = 0; s < pair.positions.length; s++) {
        // the next position is not allowed to be within the length of the pattern (overlapping)
        for(t = s+1; t < pair.positions.length; t++) {
          if(pair.positions[s] + pair.length > pair.positions[t]) {
            pair.positions.splice(t--,1);
          }
        }
      }
      if(pair.positions.length <= 1)
        pairs.splice(i--,1);
    }

    // create the arcs
    var arcs = [];
    for (i = 0; i < pairs.length; i++) {
      var pair = pairs[i];
      for(j = 0; j < pair.positions.length - 1; j++) {
        // reverse the text of the pattern (because of internal suffix tree representation)
        var text = pair.pattern.split("").reverse().join("")
        arcs.push(new Arc(pair.positions[j], pair.length, pair.positions[j+1], text));
      }
    }

    // iterate over the filtered arcs and remove the ones that are not:
    //  * identical - it is impossible that they are identical here
    //  * Consecutive
    //  * Maximal
    for(i = 0; i < arcs.length; i++)
    {
      var arc = arcs[i];
      for(j = 0; j < arcs.length; j++)
      {
        if( i === j )
          continue;
        var other = arcs[j];
        // Not maximal test:
        // the start position of this arc is within of an other
        // and the end position of this arc is not within this other
        if ((arc.sourcePos >= other.sourcePos) && 
          (arc.sourcePos + arc.numberEle) <= (other.sourcePos + other.numberEle) && 
          (arc.targetPos >= (other.sourcePos + other.numberEle))) {
            arcs.splice(i--,1);
            break;
          }
      }
    }

    return arcs;
};