The approach that @ian_jmp found was the first thing I tried, but apparently I abandoned it too early, since Ian found a way to overcome some of the shortcomings I ran into. Nice job.
As for eliminating resampling of rows, I believe Ian was suggesting adding the line:
bv[Loc Min( dist )] = .;
inside the For() loop, like this:
Names Default To Here( 1 );
// Make a big table
dt1 = New Table( "Big",
New Column( "Values", Numeric, Continuous, Formula( Random Uniform( -3, 3 ) ) ),
New Column("Row", Formula(Row()))
);
dt1 << addRows( 40000 );
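// Evaluate the formula once, then delete it so the random values stay fixed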
dt1 << runFormulas;
Column( dt1, "Values" ) << deleteFormula;
// Make a small table
dt2 = New Table( "Small",
New Column( "Values", Numeric, Continuous, Formula( Random Normal( 0, 1 ) ) )
);
dt2 << addRows( 40 );
dt2 << runFormulas;
Column( dt2, "Values" ) << deleteFormula;
// For each value in dt2, find a value in dt1 that's closest
bv = Column( dt1, "Values" ) << getValues;
sv = Column( dt2, "Values" ) << getValues;
selectedRows = J( N Row( sv ), 1, . );
For( i = 1, i <= N Row( sv ), i++,
    dist = (bv - sv[i]) ^ 2;
    selectedRows[i] = Loc Min( dist );
    // The line below prevents rows from the "Big" table from being re-sampled
    bv[Loc Min( dist )] = .;
);
// Subset dt1
dt3 = dt1 << subset( rows( selectedRows ), LinkToOriginalDataTable( 1 ) );
// Look at the distributions
New Window( "Compare Distributions",
H List Box(
dt1 << Distribution( Continuous Distribution( Column( :Values ) ) ),
dt2 << Distribution( Continuous Distribution( Column( :Values ) ) ),
dt3 << Distribution( Continuous Distribution( Column( :Values ) ) )
)
);
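One small aside, not something Ian asked about: since the loop already stores the result of Loc Min( dist ) in selectedRows[i], you can reuse that index when blanking out the matched value rather than calling Loc Min() a second time. Something like this for just the loop (minRow is only a throwaway name for illustration; the logic is identical):
For( i = 1, i <= N Row( sv ), i++,
    dist = (bv - sv[i]) ^ 2;
    minRow = Loc Min( dist );      // row in "Big" with the closest remaining value
    selectedRows[i] = minRow;
    bv[minRow] = .;                // blank it out so it can't be matched again
);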
-Jeff