[mlpack] 10/20: Significant refactoring for clarity.

Barak A. Pearlmutter barak+git at pearlmutter.net
Thu May 25 20:44:09 UTC 2017


This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch master
in repository mlpack.

commit 2c4c1ea9c7fae0743b3d13153ce9ad9e41b27989
Author: Ryan Curtin <ryan at ratml.org>
Date:   Wed May 3 17:19:16 2017 -0400

    Significant refactoring for clarity.
---
 .../tree/rectangle_tree/r_star_tree_split_impl.hpp | 848 ++++++---------------
 1 file changed, 253 insertions(+), 595 deletions(-)

diff --git a/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp b/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp
index 51ddfea..3813936 100644
--- a/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp
+++ b/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp
@@ -33,30 +33,12 @@ void RStarTreeSplit::SplitLeafNode(TreeType *tree,std::vector<bool>& relevels)
   typedef typename TreeType::ElemType ElemType;
   typedef bound::HRectBound<metric::EuclideanDistance, ElemType> BoundType;
 
+  // If there's no need to split, don't.
   if (tree->Count() <= tree->MaxLeafSize())
     return;
-//  std::cout << "split leaf node " << tree << " with parent " << tree->Parent()
-//<< "\n";
-
-  // If we are splitting the root node, we need will do things differently so
-  // that the constructor and other methods don't confuse the end user by giving
-  // an address of another node.
-//  if (tree->Parent() == NULL)
-//  {
-    // We actually want to copy this way.  Pointers and everything.
-//    TreeType* copy = new TreeType(*tree, false);
-//    copy->Parent() = tree;
-//    tree->Count() = 0;
-//    tree->NullifyData();
-    // Because this was a leaf node, numChildren must be 0.
-//    tree->children[(tree->NumChildren())++] = copy;
-//    assert(tree->NumChildren() == 1);
-
-//    RStarTreeSplit::SplitLeafNode(copy,relevels);
-//    return;
-//  }
-
-  // If we haven't yet reinserted on this level, we try doing so now.
+
+  // If we haven't yet checked if we need to reinsert on this level, we try
+  // doing so now.
   if (relevels[tree->TreeDepth() - 1])
   {
     relevels[tree->TreeDepth() - 1] = false;
@@ -67,132 +49,121 @@ void RStarTreeSplit::SplitLeafNode(TreeType *tree,std::vector<bool>& relevels)
     while (root->Parent() != NULL)
       root = root->Parent();
     size_t p = tree->MaxLeafSize() * 0.3; // The paper says this works the best.
-    if (p == 0)
-    {
-      RStarTreeSplit::SplitLeafNode(tree,relevels);
-      return;
-    }
-
-    std::vector<std::pair<ElemType, size_t>> sorted(tree->Count());
-    arma::Col<ElemType> center;
-    tree->Bound().Center(center); // Modifies centroid.
-    for (size_t i = 0; i < sorted.size(); i++)
+    if (p > 0)
     {
-      sorted[i].first = tree->Metric().Evaluate(center,
-          tree->Dataset().col(tree->Point(i)));
-      sorted[i].second = i;
-    }
+      // We'll only do reinsertions if p > 0.  p is the number of points we will
+      // reinsert.  If p == 0, then no reinsertion is necessary and we continue
+      // with the splitting procedure.
+      std::vector<std::pair<ElemType, size_t>> sorted(tree->Count());
+      arma::Col<ElemType> center;
+      tree->Bound().Center(center);
+      for (size_t i = 0; i < sorted.size(); i++)
+      {
+        sorted[i].first = tree->Metric().Evaluate(center,
+            tree->dataset->col(tree->Point(i)));
+        sorted[i].second = tree->Point(i);
+      }
 
-    std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
-    std::vector<size_t> pointIndices(sorted.size());
+      std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
 
-    for (size_t i = 0; i < p; i++)
-    {
-      // We start from the end of sorted.
-      pointIndices[i] = tree->Point(sorted[sorted.size() - 1 - i].second);
+      // Remove the points furthest from the center of the node.
+      for (size_t i = 0; i < p; i++)
+        root->DeletePoint(sorted[sorted.size() - 1 - i].second, relevels);
 
-      root->DeletePoint(tree->Point(sorted[sorted.size() - 1 - i].second),
-          relevels);
-    }
+      // Now reinsert the points, but reverse the order---insert the closest to
+      // the center first.
+      for (size_t i = p; i > 0; --i)
+        root->InsertPoint(sorted[sorted.size() - i].second, relevels);
 
-    for (size_t i = 0; i < p; i++)
-    {
-      // We reverse the order again to reinsert the closest points first.
-      root->InsertPoint(pointIndices[p - 1 - i], relevels);
+      // Any reinsertions take care of splitting.
+      return;
     }
-
-    return;
   }
 
-  int bestOverlapIndexOnBestAxis = 0;
-  int bestAreaIndexOnBestAxis = 0;
-  bool tiedOnOverlap = false;
-  int bestAxis = 0;
-  ElemType bestAxisScore = std::numeric_limits<ElemType>::max();
+  // We don't need to reinsert.  Instead, we need to split the node.
+  size_t bestAxis = 0;
+  size_t bestSplitIndex = 0;
+  ElemType bestScore = std::numeric_limits<ElemType>::max();
 
+  /**
+   * Check each dimension, to find which dimension is best to split on.
+   */
   for (size_t j = 0; j < tree->Bound().Dim(); j++)
   {
     ElemType axisScore = 0.0;
-    // Since we only have points in the leaf nodes, we only need to sort once.
-    std::vector<std::pair<ElemType, size_t>> sorted(tree->Count());
-    for (size_t i = 0; i < sorted.size(); i++)
-    {
-      sorted[i].first = tree->Dataset().col(tree->Point(i))[j];
-      sorted[i].second = i;
-    }
 
-    std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
+    // Sort in increasing values of the selected dimension j.
+    arma::Col<ElemType> dimValues(tree->Count());
+    for (size_t i = 0; i < tree->Count(); ++i)
+      dimValues[i] = tree->Dataset().col(tree->Point(i))[j];
+    arma::uvec sortedIndices = arma::sort_index(dimValues);
 
     // We'll store each of the three scores for each distribution.
-    std::vector<ElemType> areas(tree->MaxLeafSize() - 2 * tree->MinLeafSize() +
-        2);
-    std::vector<ElemType> margins(tree->MaxLeafSize() - 2 * tree->MinLeafSize() +
-        2);
-    std::vector<ElemType> overlapedAreas(tree->MaxLeafSize() -
-        2 * tree->MinLeafSize() + 2);
-
-    for (size_t i = 0; i < areas.size(); i++)
-    {
-      areas[i] = 0.0;
-      margins[i] = 0.0;
-      overlapedAreas[i] = 0.0;
-    }
+    const size_t numPossibleSplits = tree->MaxLeafSize() -
+        2 * tree->MinLeafSize() + 2;
+    arma::Col<ElemType> areas(numPossibleSplits, arma::fill::zeros);
+    arma::Col<ElemType> margins(numPossibleSplits, arma::fill::zeros);
+    arma::Col<ElemType> overlaps(numPossibleSplits, arma::fill::zeros);
 
-    for (size_t i = 0; i < areas.size(); i++)
+    for (size_t i = 0; i < numPossibleSplits; i++)
     {
       // The ith arrangement is obtained by placing the first
       // tree->MinLeafSize() + i points in one rectangle and the rest in
       // another.  Then we calculate the three scores for that distribution.
-
-      size_t cutOff = tree->MinLeafSize() + i;
+      size_t splitIndex = tree->MinLeafSize() + i;
 
       BoundType bound1(tree->Bound().Dim());
       BoundType bound2(tree->Bound().Dim());
 
-      for (size_t l = 0; l < cutOff; l++)
-        bound1 |= tree->Dataset().col(tree->Point(sorted[l].second));
+      for (size_t l = 0; l < splitIndex; l++)
+        bound1 |= tree->Dataset().col(tree->Point(sortedIndices[l]));
 
-      for (size_t l = cutOff; l < tree->Count(); l++)
-        bound2 |= tree->Dataset().col(tree->Point(sorted[l].second));
+      for (size_t l = splitIndex; l < tree->Count(); l++)
+        bound2 |= tree->Dataset().col(tree->Point(sortedIndices[l]));
 
-      ElemType area1 = bound1.Volume();
-      ElemType area2 = bound2.Volume();
-      ElemType oArea = bound1.Overlap(bound2);
+      areas[i] = bound1.Volume() + bound2.Volume();
+      overlaps[i] = bound1.Overlap(bound2);
 
       for (size_t k = 0; k < bound1.Dim(); k++)
         margins[i] += bound1[k].Width() + bound2[k].Width();
 
-      areas[i] += area1 + area2;
-      overlapedAreas[i] += oArea;
       axisScore += margins[i];
     }
 
-    if (axisScore < bestAxisScore)
+    // Is this dimension a new best score?  We want the lowest possible score.
+    if (axisScore < bestScore)
     {
-      bestAxisScore = axisScore;
+      bestScore = axisScore;
       bestAxis = j;
-      bestOverlapIndexOnBestAxis = 0;
-      bestAreaIndexOnBestAxis = 0;
+      size_t overlapIndex = 0;
+      size_t areaIndex = 0;
+      bool tiedOnOverlap = false;
 
-      for (size_t i = 1; i < areas.size(); i++)
+      for (size_t i = 1; i < areas.n_elem; i++)
       {
-        if (overlapedAreas[i] < overlapedAreas[bestOverlapIndexOnBestAxis])
+        if (overlaps[i] < overlaps[overlapIndex])
         {
           tiedOnOverlap = false;
-          bestAreaIndexOnBestAxis = i;
-          bestOverlapIndexOnBestAxis = i;
+          overlapIndex = i;
+          areaIndex = i;
         }
-        else if (overlapedAreas[i] ==
-            overlapedAreas[bestOverlapIndexOnBestAxis])
+        else if (overlaps[i] == overlaps[overlapIndex])
         {
           tiedOnOverlap = true;
-          if (areas[i] < areas[bestAreaIndexOnBestAxis])
-            bestAreaIndexOnBestAxis = i;
+          if (areas[i] < areas[areaIndex])
+            areaIndex = i;
         }
       }
+
+      // Select the best index for splitting.
+      bestSplitIndex = (tiedOnOverlap ? areaIndex : overlapIndex);
     }
   }
 
+  /**
+   * Now that we have found the best dimension to split on, re-sort in that
+   * dimension to prepare for reinsertion of points into the new nodes.
+   */
   std::vector<std::pair<ElemType, size_t>> sorted(tree->Count());
   for (size_t i = 0; i < sorted.size(); i++)
   {
@@ -202,128 +173,57 @@ void RStarTreeSplit::SplitLeafNode(TreeType *tree,std::vector<bool>& relevels)
 
   std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
 
-  if (tree->Parent())
+  /**
+   * If 'tree' is the root of the tree (i.e. if it has no parent), then we must
+   * create two new child nodes, distribute the points from the original node
+   * among them, and insert those.  If 'tree' is not the root of the tree, then
+   * we may create only one new child node, redistribute the points from the
+   * original node between 'tree' and the new node, then insert those nodes into
+   * the parent.
+   *
+   * Here we simply set treeOne and treeTwo to the right values to avoid code
+   * duplication.
+   */
+  TreeType* par = tree->Parent();
+  TreeType* treeOne = (par) ? tree              : new TreeType(tree);
+  TreeType* treeTwo = (par) ? new TreeType(par) : new TreeType(tree);
+
+  // Now clean the node, and we will re-use this.
+  const size_t numPoints = tree->Count();
+
+  // Reset the original node's values, regardless of whether it will become
+  // the new parent or not.
+  tree->numChildren = 0;
+  tree->numDescendants = 0;
+  tree->count = 0;
+  tree->bound.Clear();
+
+  // Insert the points into the appropriate tree.
+  for (size_t i = 0; i < numPoints; i++)
   {
-//    std::cout << "first check parent " << tree->Parent() << " numDescendants "
-//<< tree->Parent()->NumDescendants() <<
-//" with " << tree->Parent()->NumChildren() << " children\n";
-//    size_t manualCount = 0;
-//    for (size_t i = 0; i < tree->Parent()->NumChildren(); ++i)
-//    {
-//      std::cout << "(" << &(tree->Parent()->Child(i)) << ") ";
-//      manualCount += tree->Parent()->Child(i).NumDescendants();
-//      std::cout << tree->Parent()->Child(i).NumDescendants() << " ";
-//    }
-//    std::cout << "\n";
-  //  TreeType* treeOne = new TreeType(tree->Parent());
-    // Now clean the node, and we will re-use this.
-    const size_t oldDescendants = tree->numDescendants;
-    const size_t numPoints = tree->count;
-    tree->numChildren = 0;
-    tree->numDescendants = 0;
-    tree->bound.Clear();
-    tree->count = 0;
-    tree->begin = 0;
-
-    TreeType* treeTwo = new TreeType(tree->Parent());
-
-    if (tiedOnOverlap)
-    {
-      for (size_t i = 0; i < numPoints; i++)
-      {
-        if (i < bestAreaIndexOnBestAxis + tree->MinLeafSize())
-          tree->InsertPoint(sorted[i].second);
-        else
-          treeTwo->InsertPoint(sorted[i].second);
-      }
-    }
+    if (i < bestSplitIndex + tree->MinLeafSize())
+      treeOne->InsertPoint(sorted[i].second);
     else
-    {
-      for (size_t i = 0; i < numPoints; i++)
-      {
-        if (i < bestOverlapIndexOnBestAxis + tree->MinLeafSize())
-          tree->InsertPoint(sorted[i].second);
-        else
-          treeTwo->InsertPoint(sorted[i].second);
-      }
-    }
+      treeTwo->InsertPoint(sorted[i].second);
+  }
 
-    // Insert the new tree node.
-    TreeType* par = tree->Parent();
+  // Insert the new tree node(s).
+  if (par)
+  {
+    // Just insert the new node into the parent.
     par->children[par->NumChildren()++] = treeTwo;
 
-    // We only add one at a time, so we should only need to test for equality
-    // just in case, we use an assert.
-//    std::cout << "x: " << oldDescendants << " == " << tree->NumDescendants() <<
-//" + " << treeTwo->NumDescendants() << " (" << tree->NumDescendants() +
-//treeTwo->NumDescendants() << ")\n";
-    assert(oldDescendants == tree->NumDescendants() + treeTwo->NumDescendants());
-//    std::cout << "check parent " << par << " numDescendants " << par->NumDescendants() <<
-//" with " << par->NumChildren() << " children\n";
-//    manualCount = 0;
-//    for (size_t i = 0; i < par->NumChildren(); ++i)
-//    {
-//      std::cout << "(" << &(par->Child(i)) << ") ";
-//      manualCount += par->Child(i).NumDescendants();
-//      std::cout << par->Child(i).NumDescendants() << " ";
-//    }
-//    std::cout << "\n";
-//    assert(par->NumDescendants() == manualCount);
-    assert(par->NumChildren() <= par->MaxNumChildren() + 1);
+    // If we have overflowed the parent's children, then we need to split that
+    // node also.
     if (par->NumChildren() == par->MaxNumChildren() + 1)
       RStarTreeSplit::SplitNonLeafNode(par, relevels);
-
-    assert(tree->Parent()->NumChildren() <= tree->MaxNumChildren());
-    assert(tree->Parent()->NumChildren() >= tree->MinNumChildren());
-    assert(treeTwo->Parent()->NumChildren() <= treeTwo->MaxNumChildren());
-    assert(treeTwo->Parent()->NumChildren() >= treeTwo->MinNumChildren());
   }
   else
   {
-    TreeType* treeOne = new TreeType(tree);
-    TreeType* treeTwo = new TreeType(tree);
-
-    const size_t oldNumDescendants = tree->NumDescendants();
-    const size_t numPoints = tree->Count();
-    tree->numChildren = 0;
-    tree->bound.Clear();
-    tree->count = 0;
-    tree->begin = 0;
-    tree->numDescendants = 0;
-
-    if (tiedOnOverlap)
-    {
-      for (size_t i = 0; i < numPoints; ++i)
-      {
-        if (i < bestAreaIndexOnBestAxis + tree->MinLeafSize())
-          treeOne->InsertPoint(sorted[i].second);
-        else
-          treeTwo->InsertPoint(sorted[i].second);
-      }
-    }
-    else
-    {
-      for (size_t i = 0; i < numPoints; ++i)
-      {
-        if (i < bestOverlapIndexOnBestAxis + tree->MinLeafSize())
-          treeOne->InsertPoint(sorted[i].second);
-        else
-          treeTwo->InsertPoint(sorted[i].second);
-      }
-    }
-
+    // Now insert the two nodes into 'tree', which is now a higher-level root
+    // node in the tree.
     InsertNodeIntoTree(tree, treeOne);
     InsertNodeIntoTree(tree, treeTwo);
-
-//    std::cout << "y: " << oldNumDescendants << ": " << treeOne->NumDescendants()
-//<< " + " << treeTwo->NumDescendants() << " (" << tree->NumDescendants() <<
-//")\n";
-//    std::cout << "tree left " << treeOne->NumDescendants() << ", right " <<
-//treeTwo->NumDescendants() << ", parent " << tree->NumDescendants() << "\n";
-//    for (size_t i = 0; i < tree->NumChildren(); ++i)
-//      std::cout << tree->Child(i).NumDescendants() << " ";
-//    std::cout << " (that's the children of the parent)\n";
-    assert(oldNumDescendants == tree->NumDescendants());
   }
 }
 
@@ -337,435 +237,205 @@ void RStarTreeSplit::SplitLeafNode(TreeType *tree,std::vector<bool>& relevels)
 template<typename TreeType>
 bool RStarTreeSplit::SplitNonLeafNode(TreeType *tree,std::vector<bool>& relevels)
 {
-//  std::cout << "split nonleaf node " << tree << " with parent " <<
-//tree->Parent() << "\n";
   // Convenience typedef.
   typedef typename TreeType::ElemType ElemType;
   typedef bound::HRectBound<metric::EuclideanDistance, ElemType> BoundType;
 
-  // If we are splitting the root node, we need will do things differently so
-  // that the constructor and other methods don't confuse the end user by giving
-  // an address of another node.
-//  if (tree->Parent() == NULL)
-//  {
-    // We actually want to copy this way.  Pointers and everything.
-//    TreeType* copy = new TreeType(*tree, false);
-
-//    copy->Parent() = tree;
-//    tree->NumChildren() = 0;
-//    tree->NullifyData();
-//    tree->children[(tree->NumChildren())++] = copy;
-
-//    RStarTreeSplit::SplitNonLeafNode(copy,relevels);
-//    return true;
-//  }
-
- /*
-  // If we haven't yet reinserted on this level, we try doing so now.
-  if (relevels[tree->TreeDepth()]) {
-    relevels[tree->TreeDepth()] = false;
-    // We sort the points by decreasing centroid to centroid distance.
-    // We then remove the first p entries and reinsert them at the root.
-    RectangleTree<RStarTreeSplit, DescentType, StatisticType, MatType>* root = tree;
-    while(root->Parent() != NULL)
-      root = root->Parent();
-    size_t p = tree->MaxNumChildren() * 0.3; // The paper says this works the best.
-    if (p == 0) {
-      SplitNonLeafNode(tree, relevels);
-      return false;
-    }
+  // Reinsertion isn't done for non-leaf nodes; the paper doesn't seem to make
+  // it clear how to reinsert an entire node without reinserting each of the
+  // points, so we will avoid that here.  This is a possible area for
+  // improvement of this code.
 
-    std::vector<sortStruct> sorted(tree->NumChildren());
-    arma::vec c1;
-    tree->Bound().Center(c1); // Modifies c1.
-    for(size_t i = 0; i < sorted.size(); i++) {
-      arma::vec c2;
-      tree->Children()[i]->Bound().Center(c2); // Modifies c2.
-      sorted[i].d = tree->Bound().Metric().Evaluate(c1,c2);
-      sorted[i].n = i;
-    }
-    std::sort(sorted.begin(), sorted.end(), structComp);
-
-    //bool startBelowMinFill = tree->NumChildren() < tree->MinNumChildren();
+  size_t bestAxis = 0;
+  size_t bestIndex = 0;
+  ElemType bestScore = std::numeric_limits<ElemType>::max();
+  bool lowIsBetter = true;
 
-    std::vector<RectangleTree<RStarTreeSplit, DescentType, StatisticType, MatType>*> removedNodes(p);
-    for(size_t i =0; i < p; i++) {
-      removedNodes[i] = tree->Children()[sorted[sorted.size()-1-i].n]; // We start from the end of sorted.
-      root->RemoveNode(tree->Children()[sorted[sorted.size()-1-i].n], relevels);
-    }
-
-    for(size_t i = 0; i < p; i++) {
-      root->InsertNode(removedNodes[p-1-i], tree->TreeDepth(), relevels); // We reverse the order again to reinsert the closest nodes first.
-    }
-
-    // If we went below min fill, delete this node and reinsert all children.
-    //SOMETHING IS WRONG.  SHOULD NOT GO BELOW MIN FILL.
-//    if (!startBelowMinFill && tree->NumChildren() < tree->MinNumChildren())
-//    std::cout<<"MINFILLERROR "<< p << ", " << tree->NumChildren() << "; " << tree->MaxNumChildren()<<std::endl;
-
-//    if (tree->NumChildren() < tree->MinNumChildren()) {
-//      std::vector<RectangleTree<RStarTreeSplit, DescentType, StatisticType, MatType>*> rmNodes(tree->NumChildren());
-//      for(size_t i = 0; i < rmNodes.size(); i++) {
-//        rmNodes[i] = tree->Children()[i];
-//      }
-//      root->RemoveNode(tree, relevels);
-//      for(size_t i = 0; i < rmNodes.size(); i++) {
-//        root->InsertNode(rmNodes[i], tree->TreeDepth(), relevels);
-//      }
-// //      tree->SoftDelete();
-//    }
- //    assert(tree->NumChildren() >= tree->MinNumChildren());
- //    assert(tree->NumChildren() <= tree->MaxNumChildren());
-
-   return false;
- }
-
-*/
-
-  int bestOverlapIndexOnBestAxis = 0;
-  int bestAreaIndexOnBestAxis = 0;
-  bool tiedOnOverlap = false;
-  bool lowIsBest = true;
-  int bestAxis = 0;
-  ElemType bestAxisScore = std::numeric_limits<ElemType>::max();
+  /**
+   * Check over each dimension to see which is best to use for splitting.
+   */
   for (size_t j = 0; j < tree->Bound().Dim(); j++)
   {
-    ElemType axisScore = 0.0;
+    ElemType axisLoScore = 0.0;
+    ElemType axisHiScore = 0.0;
 
-    // We'll do Bound().Lo() now and use Bound().Hi() later.
-    std::vector<std::pair<ElemType, size_t>> sorted(tree->NumChildren());
-    for (size_t i = 0; i < sorted.size(); i++)
-    {
-      sorted[i].first = tree->Child(i).Bound()[j].Lo();
-      sorted[i].second = i;
-    }
-
-    std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
-
-    // We'll store each of the three scores for each distribution.
-    std::vector<ElemType> areas(tree->MaxNumChildren() -
-        2 * tree->MinNumChildren() + 2);
-    std::vector<ElemType> margins(tree->MaxNumChildren() -
-        2 * tree->MinNumChildren() + 2);
-    std::vector<ElemType> overlapedAreas(tree->MaxNumChildren() -
-        2 * tree->MinNumChildren() + 2);
-
-    for (size_t i = 0; i < areas.size(); i++)
+    // We have to calculate values for both the lower and higher parts of the
+    // bound.
+    arma::Col<ElemType> loDimValues(tree->NumChildren());
+    arma::Col<ElemType> hiDimValues(tree->NumChildren());
+    for (size_t i = 0; i < tree->NumChildren(); i++)
     {
-      areas[i] = 0.0;
-      margins[i] = 0.0;
-      overlapedAreas[i] = 0.0;
+      loDimValues[i] = tree->Child(i).Bound()[j].Lo();
+      hiDimValues[i] = tree->Child(i).Bound()[j].Hi();
     }
-
-    for (size_t i = 0; i < areas.size(); i++)
+    arma::uvec sortedLoDimIndices = arma::sort_index(loDimValues);
+    arma::uvec sortedHiDimIndices = arma::sort_index(hiDimValues);
+
+    // We'll store each of the three scores for each distribution.  Remember
+    // that these are the sums calculated over both the low and high bounds of
+    // each rectangle.
+    const size_t numPossibleSplits = tree->MaxNumChildren() -
+        2 * tree->MinNumChildren() + 2;
+    arma::Col<ElemType> areas(2 * numPossibleSplits, arma::fill::zeros);
+    arma::Col<ElemType> margins(2 * numPossibleSplits, arma::fill::zeros);
+    arma::Col<ElemType> overlaps(2 * numPossibleSplits, arma::fill::zeros);
+
+    for (size_t i = 0; i < numPossibleSplits; ++i)
     {
       // The ith arrangement is obtained by placing the first
       // tree->MinNumChildren() + i points in one rectangle and the rest in
       // another.  Then we calculate the three scores for that distribution.
+      const size_t splitIndex = tree->MinNumChildren() + i;
 
-      size_t cutOff = tree->MinNumChildren() + i;
-
-      BoundType bound1(tree->Bound().Dim());
-      BoundType bound2(tree->Bound().Dim());
-
-      for (size_t l = 0; l < cutOff; l++)
-        bound1 |= tree->Child(sorted[l].second).Bound();
+      BoundType lb1(tree->Bound().Dim());
+      BoundType lb2(tree->Bound().Dim());
+      BoundType hb1(tree->Bound().Dim());
+      BoundType hb2(tree->Bound().Dim());
 
-      for (size_t l = cutOff; l < tree->NumChildren(); l++)
-        bound2 |= tree->Child(sorted[l].second).Bound();
-
-      ElemType area1 = bound1.Volume();
-      ElemType area2 = bound2.Volume();
-      ElemType oArea = bound1.Overlap(bound2);
-
-      for (size_t k = 0; k < bound1.Dim(); k++)
-        margins[i] += bound1[k].Width() + bound2[k].Width();
-
-      areas[i] += area1 + area2;
-      overlapedAreas[i] += oArea;
-      axisScore += margins[i];
-    }
-
-    if (axisScore < bestAxisScore)
-    {
-      bestAxisScore = axisScore;
-      bestAxis = j;
-      bestOverlapIndexOnBestAxis = 0;
-      bestAreaIndexOnBestAxis = 0;
-      for (size_t i = 1; i < areas.size(); i++)
+      for (size_t l = 0; l < splitIndex; ++l)
       {
-        if (overlapedAreas[i] < overlapedAreas[bestOverlapIndexOnBestAxis])
-        {
-          tiedOnOverlap = false;
-          bestAreaIndexOnBestAxis = i;
-          bestOverlapIndexOnBestAxis = i;
-        }
-        else if (overlapedAreas[i] ==
-            overlapedAreas[bestOverlapIndexOnBestAxis])
-        {
-          tiedOnOverlap = true;
-          if (areas[i] < areas[bestAreaIndexOnBestAxis])
-            bestAreaIndexOnBestAxis = i;
-        }
+        lb1 |= tree->Child(sortedLoDimIndices[l]).Bound();
+        hb1 |= tree->Child(sortedHiDimIndices[l]).Bound();
+      }
+      for (size_t l = splitIndex; l < tree->NumChildren(); ++l)
+      {
+        lb2 |= tree->Child(sortedLoDimIndices[l]).Bound();
+        hb2 |= tree->Child(sortedHiDimIndices[l]).Bound();
       }
-    }
-  }
 
-  // Now we do the same thing using Bound().Hi() and choose the best of the two.
-  for (size_t j = 0; j < tree->Bound().Dim(); j++)
-  {
-    ElemType axisScore = 0.0;
+      // Calculate low bound distributions.
+      areas[2 * i] = lb1.Volume() + lb2.Volume();
+      overlaps[2 * i] = lb1.Overlap(lb2);
 
-    std::vector<std::pair<ElemType, size_t>> sorted(tree->NumChildren());
-    for (size_t i = 0; i < sorted.size(); i++)
-    {
-      sorted[i].first = tree->Child(i).Bound()[j].Hi();
-      sorted[i].second = i;
-    }
+      // Calculate high bound distributions.
+      areas[2 * i + 1] = hb1.Volume() + hb2.Volume();
+      overlaps[2 * i + 1] = hb1.Overlap(hb2);
 
-    std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
+      // Now calculate margins for each.
+      for (size_t k = 0; k < lb1.Dim(); k++)
+      {
+        margins[2 * i]     += lb1[k].Width() + lb2[k].Width();
+        margins[2 * i + 1] += hb1[k].Width() + hb2[k].Width();
+      }
 
-    // We'll store each of the three scores for each distribution.
-    std::vector<ElemType> areas(tree->MaxNumChildren() -
-        2 * tree->MinNumChildren() + 2);
-    std::vector<ElemType> margins(tree->MaxNumChildren() -
-        2 * tree->MinNumChildren() + 2);
-    std::vector<ElemType> overlapedAreas(tree->MaxNumChildren() -
-        2 * tree->MinNumChildren() + 2);
-
-    for (size_t i = 0; i < areas.size(); i++)
-    {
-      areas[i] = 0.0;
-      margins[i] = 0.0;
-      overlapedAreas[i] = 0.0;
+      // The score we use is the sum of all scores.
+      axisLoScore += margins[2 * i];
+      axisHiScore += margins[2 * i + 1];
     }
 
-    for (size_t i = 0; i < areas.size(); i++)
+    // If this dimension's score (for lower or higher bound scores) is a new
+    // best, then extract the necessary split information.
+    if (std::min(axisLoScore, axisHiScore) < bestScore)
     {
-      // The ith arrangement is obtained by placing the first
-      // tree->MinNumChildren() + i points in one rectangle and the rest in
-      // another.  Then we calculate the three scores for that distribution.
+      bestScore = std::min(axisLoScore, axisHiScore);
+      if (axisLoScore < axisHiScore)
+        lowIsBetter = true;
+      else
+        lowIsBetter = false;
 
-      size_t cutOff = tree->MinNumChildren() + i;
-
-      BoundType bound1(tree->Bound().Dim());
-      BoundType bound2(tree->Bound().Dim());
-
-      for (size_t l = 0; l < cutOff; l++)
-        bound1 |= tree->Child(sorted[l].second).Bound();
-
-      for (size_t l = cutOff; l < tree->NumChildren(); l++)
-        bound2 |= tree->Child(sorted[l].second).Bound();
-
-      ElemType area1 = bound1.Volume();
-      ElemType area2 = bound2.Volume();
-      ElemType oArea = bound1.Overlap(bound2);
-
-      for (size_t k = 0; k < bound1.Dim(); k++)
-        margins[i] += bound1[k].Width() + bound2[k].Width();
+      bestAxis = j;
 
-      areas[i] += area1 + area2;
-      overlapedAreas[i] += oArea;
-      axisScore += margins[i];
-    }
+      // This will get us either the lower or higher bound depending on which we
+      // want.  Remember that we selected *either* the lower or higher bounds to
+      // split on, so we want to only check those.
+      const size_t indexOffset = lowIsBetter ? 0 : 1;
 
-    if (axisScore < bestAxisScore)
-    {
-      bestAxisScore = axisScore;
-      bestAxis = j;
-      lowIsBest = false;
-      bestOverlapIndexOnBestAxis = 0;
-      bestAreaIndexOnBestAxis = 0;
+      size_t overlapIndex = indexOffset;
+      size_t areaIndex = indexOffset;
+      bool tiedOnOverlap = false;
 
-      for (size_t i = 1; i < areas.size(); i++)
+      // Find the best possible split (and whether it is on the low values or
+      // high values of the bounds).
+      for (size_t i = 1; i < numPossibleSplits; i++)
       {
-        if (overlapedAreas[i] < overlapedAreas[bestOverlapIndexOnBestAxis])
+        // Check bounds.
+        if (overlaps[2 * i + indexOffset] < overlaps[overlapIndex])
         {
           tiedOnOverlap = false;
-          bestAreaIndexOnBestAxis = i;
-          bestOverlapIndexOnBestAxis = i;
+          areaIndex = 2 * i + indexOffset;
+          overlapIndex = 2 * i + indexOffset;
         }
-        else if (overlapedAreas[i] ==
-            overlapedAreas[bestOverlapIndexOnBestAxis])
+        else if (overlaps[i] == overlaps[overlapIndex])
         {
           tiedOnOverlap = true;
-          if (areas[i] < areas[bestAreaIndexOnBestAxis])
-            bestAreaIndexOnBestAxis = i;
+          if (areas[2 * i + indexOffset] < areas[areaIndex])
+            areaIndex = 2 * i + indexOffset;
         }
       }
+
+      bestIndex = ((tiedOnOverlap ? areaIndex : overlapIndex) - indexOffset) / 2
+          + tree->MinNumChildren();
     }
   }
 
-  std::vector<std::pair<ElemType, TreeType*>> sorted(tree->NumChildren());
-  if (lowIsBest)
+  // Get a list of the old children.
+  std::vector<TreeType*> oldChildren(tree->NumChildren());
+  for (size_t i = 0; i < oldChildren.size(); ++i)
+    oldChildren[i] = &tree->Child(i);
+
+  /**
+   * If 'tree' is the root of the tree (i.e. if it has no parent), then we must
+   * create two new child nodes, distribute the children from the original node
+   * among them, and insert those.  If 'tree' is not the root of the tree, then
+   * we may create only one new child node, redistribute the children from the
+   * original node between 'tree' and the new node, then insert those nodes into
+   * the parent.
+   *
+   * Here, we simply set treeOne and treeTwo to the right values to avoid code
+   * duplication.
+   */
+  TreeType* par = tree->Parent();
+  TreeType* treeOne = par ? tree              : new TreeType(tree);
+  TreeType* treeTwo = par ? new TreeType(par) : new TreeType(tree);
+
+  // Now clean the node.
+  tree->numChildren = 0;
+  tree->numDescendants = 0;
+  tree->count = 0;
+  tree->bound.Clear();
+
+  // Assemble vector of values.
+  arma::Col<ElemType> values(oldChildren.size());
+  for (size_t i = 0; i < oldChildren.size(); ++i)
   {
-    for (size_t i = 0; i < sorted.size(); i++)
-    {
-      sorted[i].first = tree->Child(i).Bound()[bestAxis].Lo();
-      sorted[i].second = &tree->Child(i);
-    }
-  }
-  else
-  {
-    for (size_t i = 0; i < sorted.size(); i++)
-    {
-      sorted[i].first = tree->Child(i).Bound()[bestAxis].Hi();
-      sorted[i].second = &tree->Child(i);
-    }
+    values[i] = (lowIsBetter ? oldChildren[i]->Bound()[bestAxis].Lo()
+                             : oldChildren[i]->Bound()[bestAxis].Hi());
   }
+  arma::uvec indices = arma::sort_index(values);
 
-  std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, TreeType*>);
+  for (size_t i = 0; i < bestIndex; ++i)
+    InsertNodeIntoTree(treeOne, oldChildren[indices[i]]);
+  for (size_t i = bestIndex; i < oldChildren.size(); ++i)
+    InsertNodeIntoTree(treeTwo, oldChildren[indices[i]]);
 
-  if (tree->Parent() != NULL)
+  // Insert the new tree node(s).
+  if (par)
   {
-    const size_t oldNumDescendants = tree->NumDescendants();
-//    for (size_t i = 0; i < tree->NumChildren(); ++i)
-//      std::cout << tree->Child(i).NumDescendants() << " ";
-//    std::cout << " (total " << tree->NumDescendants() << ", count " <<
-//tree->count << "\n";
-    const size_t oldNumChildren = tree->NumChildren();
-    tree->numChildren = 0;
-    tree->bound.Clear();
-    tree->count = 0;
-    tree->begin = 0;
-    tree->numDescendants = 0;
-    TreeType* treeTwo = new TreeType(tree->Parent());
-
-    if (tiedOnOverlap)
-    {
-      for (size_t i = 0; i < oldNumChildren; i++)
-      {
-        if (i < bestAreaIndexOnBestAxis + tree->MinNumChildren())
-        {
-//          std::cout << "insert " << sorted[i].second->NumDescendants() << " into tree"
-//              << "\n";
-          InsertNodeIntoTree(tree, sorted[i].second);
-        }
-        else
-        {
-//          std::cout << "insert " << sorted[i].second->NumDescendants() << " into treeTwo\n";
-          InsertNodeIntoTree(treeTwo, sorted[i].second);
-        }
-      }
-    }
-    else
-    {
-      for (size_t i = 0; i < oldNumChildren; i++)
-      {
-        if (i < bestOverlapIndexOnBestAxis + tree->MinNumChildren())
-        {
-//          std::cout << "insert " << sorted[i].second->NumDescendants() << " into tree\n";
-          InsertNodeIntoTree(tree, sorted[i].second);
-        }
-        else
-        {
-//          std::cout << "insert " << sorted[i].second->NumDescendants() << " into treeTwo\n";
-          InsertNodeIntoTree(treeTwo, sorted[i].second);
-        }
-      }
-    }
-
-    // Insert the new node into the tree.
-    TreeType* par = tree->Parent();
+    // Insert the new node into the parent.  The number of descendants does not
+    // need to be updated.
     par->children[par->NumChildren()++] = treeTwo;
-
-    // We only add one at a time, so we should only need to test for equality
-    // just in case, we use an assert.
-    assert(par->NumChildren() <= par->MaxNumChildren() + 1);
-    if (par->NumChildren() == par->MaxNumChildren() + 1)
-      RStarTreeSplit::SplitNonLeafNode(par,relevels);
-
-    // We have to update the children of each of these new nodes so that they
-    // record the correct parent.
-    for (size_t i = 0; i < tree->NumChildren(); i++)
-      tree->children[i]->Parent() = tree;
-
-    for (size_t i = 0; i < treeTwo->NumChildren(); i++)
-      treeTwo->children[i]->Parent() = treeTwo;
-
-//    std::cout << "tree left " << tree->NumDescendants() << ", right " <<
-//treeTwo->NumDescendants() << ", parent " << par->NumDescendants() << "\n";
-//    for (size_t i = 0; i < par->NumChildren(); ++i)
-//      std::cout << par->Child(i).NumDescendants() << " ";
-//    std::cout << " (that's the children of the parent)\n";
-    assert(oldNumDescendants == (tree->NumDescendants() +
-treeTwo->NumDescendants()));
-
-
-    assert(tree->Parent()->NumChildren() <= tree->MaxNumChildren());
-    assert(tree->Parent()->NumChildren() >= tree->MinNumChildren());
-    assert(treeTwo->Parent()->NumChildren() <= treeTwo->MaxNumChildren());
-    assert(treeTwo->Parent()->NumChildren() >= treeTwo->MinNumChildren());
-
-    assert(tree->MaxNumChildren() < 7);
-    assert(treeTwo->MaxNumChildren() < 7);
   }
   else
   {
-    const size_t oldDescendants = tree->NumDescendants();
-//    for (size_t i = 0; i < tree->NumChildren(); ++i)
-//      std::cout << tree->Child(i).NumDescendants() << " ";
-//    std::cout << " (total " << tree->NumDescendants() << ", count " <<
-//tree->count << "\n";
-    TreeType* treeOne = new TreeType(tree);
-    TreeType* treeTwo = new TreeType(tree);
-
-    const size_t oldNumChildren = tree->NumChildren();
-    tree->count = 0;
-    tree->numChildren = 0;
-    tree->bound.Clear();
-    tree->numDescendants = 0;
-
-    if (tiedOnOverlap)
-    {
-      for (size_t i = 0; i < oldNumChildren; i++)
-      {
-        if (i < bestAreaIndexOnBestAxis + tree->MinNumChildren())
-        {
-//          std::cout << "insert " << sorted[i].second->NumDescendants() << " into treeOne\n";
-          InsertNodeIntoTree(treeOne, sorted[i].second);
-        }
-        else
-        {
-//          std::cout << "insert " << sorted[i].second->NumDescendants() << " into treeTwo\n";
-          InsertNodeIntoTree(treeTwo, sorted[i].second);
-        }
-      }
-    }
-    else
-    {
-      for (size_t i = 0; i < oldNumChildren; i++)
-      {
-        if (i < bestOverlapIndexOnBestAxis + tree->MinNumChildren())
-        {
-//          std::cout << "insert " << sorted[i].second->NumDescendants() << " into treeOne\n";
-          InsertNodeIntoTree(treeOne, sorted[i].second);
-        }
-        else
-        {
-//          std::cout << "insert " << sorted[i].second->NumDescendants() << " into treeTwo\n";
-          InsertNodeIntoTree(treeTwo, sorted[i].second);
-        }
-      }
-    }
-
+    // Insert both nodes into 'tree', which is now a higher-level root node.
     InsertNodeIntoTree(tree, treeOne);
     InsertNodeIntoTree(tree, treeTwo);
 
-//    std::cout << oldDescendants << "; " << treeOne->numDescendants << ", " <<
-//treeTwo->numDescendants << " --> " << tree->numDescendants << "\n";
-    assert(oldDescendants == (treeOne->numDescendants +
-        treeTwo->numDescendants));
-
-    // We have to update the children of each of these new nodes so that they
-    // record the correct parent.
+    // We have to update the children of treeOne so that they record the correct
+    // parent.
     for (size_t i = 0; i < treeOne->NumChildren(); i++)
       treeOne->children[i]->Parent() = treeOne;
-
-    for (size_t i = 0; i < treeTwo->NumChildren(); i++)
-      treeTwo->children[i]->Parent() = treeTwo;
   }
 
+  // Update the children of treeTwo to have the correct parent.
+  for (size_t i = 0; i < treeTwo->NumChildren(); i++)
+    treeTwo->children[i]->Parent() = treeTwo;
+
+  // If we have overflowed hte parent's children, then we need to split that
+  // node also.
+  if (par && par->NumChildren() >= par->MaxNumChildren() + 1)
+    RStarTreeSplit::SplitNonLeafNode(par, relevels);
+
   return false;
 }
 
@@ -776,21 +446,9 @@ treeTwo->NumDescendants()));
 template<typename TreeType>
 void RStarTreeSplit::InsertNodeIntoTree(TreeType* destTree, TreeType* srcNode)
 {
-//  std::cout << "insert " << srcNode << " into " << destTree << "\n";
-
   destTree->Bound() |= srcNode->Bound();
   destTree->numDescendants += srcNode->numDescendants;
   destTree->children[destTree->NumChildren()++] = srcNode;
-
-//  std::cout << "dest now has " << destTree->NumDescendants() << "\n";
-//  size_t manualCount = 0;
-//  for (size_t i = 0; i < destTree->NumChildren(); ++i)
-//  {
-//    manualCount += destTree->Child(i).NumDescendants();
-//    std::cout << destTree->Child(i).NumDescendants() << " ";
-//  }
-//  std::cout << "\n";
-//  assert(manualCount == destTree->NumDescendants());
 }
 
 } // namespace tree

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git



More information about the debian-science-commits mailing list