topic Re: Nearest Neighbor Algorithm: Need Optimization Help in Discussions

Nearest Neighbor Algorithm: Need Optimization Help

LaserGuy — Fri, 09 Jun 2023 22:06:01 GMT

Hi Everyone,

I have created the following script. It is meant to analyze a data table where each row corresponds to a chip in a wafer, and each row has a coordinate (x,y) and some parameters. The script will go through each row, look for other rows that correspond to dies that are "neighbors" to this die, and calculate the median and standard deviation of some parameter for that neighborhood.

The issue here is that it while it works, it takes a long time to run when analyzing data sets with a tens of thousands of rows.

I would really appreciate it if anyone can give pointers on how to optimize the script to reduce the computation time.

Code Below:

dt0 = current data table();
	dt0 << new column("NN_median");
	dt0 << new column("NN_stdev");

for ( i = 1, i <= N Rows(dt0), i++,

	//Find rows corresponding to neighboring dies
	m = dt0[0,{"x", "y"}];
	v = ( abs( m[0,1] - :x[i] ) <= 2 )
		&
		( abs( m[0,2] - :y[i] ) <= 2 );
	rows = loc(v);
	
	// Create new DT of neighborhood

	dt1 = dt0 << subset( Rows(rows), Columns(:param), invisible );
	
	//Calculate Median and Std Dev
	tb = Tabulate(
		Add Table(
			Column Table( Statistics( Median, Std Dev ) ),
			Row Table( Analysis Columns( column("param") ) )
		)
	);

	dt2 = tb << make into data table( invisible );
	tb << close window;
	
	current data table(dt2);
	placeholder_median = :Median[1];
	placeholder_stdev = :Std Dev[1];
	
	close(dt2, no save);
	close(dt1, no save);
	
	current data table(dt0);
	:NN_median[i] = placeholder_median;
	:NN_stdev[i] = placeholder_stdev;

);

Re: Nearest Neighbor Algorithm: Need Optimization Help

txnelson — Thu, 11 Feb 2021 08:05:13 GMT

I obtained the identical results in 11 seconds using

dt0 = Current Data Table();
dt0 << New Column( "NN_median" );
dt0 << New Column( "NN_stdev" );
	
m = dt0[0, {"x", "y"}];
For( i = 1, i <= N Rows( dt0 ), i++, 

	//Find rows corresponding to neighboring dies
	
	v = (Abs( m[0, 1] - :x[i] ) <= 2) & (Abs( m[0, 2] - :y[i] ) <= 2);
	rows = Loc( v );
	dt0:NN_stdev[i] = Std Dev( dt0:param[rows] );
	dt0:NN_median[i] = Quantile( .5, dt0:param[rows] );
);

while your code too over 13 minutes

Re: Nearest Neighbor Algorithm: Need Optimization Help

ian_jmp — Thu, 11 Feb 2021 10:59:23 GMT

I didn't do any benchmarking, but might expect that using 'KDTable()' would be good for larger, more general problems:

Names Default To Here( 1 );

n = 100;		// Size of square

// Make a table of x and y locations
dtx = AsTable((1::n)`, << Invisible);
dty = AsTable((1::n)`, << Invisible);
dt = dty << Join( With( dtx ), Cartesian Join );
Close(dtx, NoSave);
Close(dty, NoSave);
Column(dt, 1) << setName("x");
Column(dt, 2) << setName("y");
dt << deleteProperty("Source");
dt << setName("xy Locations");

// Put these locations into a matrix
mat = dt << getAsMatrix;
tbl = KDTable( mat );

// Pick a point in the interior, so we don't need to decide on any boundary conditions
myX = RandomInteger(2, n-1);
myY = RandomInteger(2, n-1);
myRow = Loc(mat[0, 1] == myX & mat[0, 2] == myY);
myRow = myRow[1];

// Find the 8 nearest neighbours to the chosen point
{rows, dist} = tbl << K nearest rows( 8, myRow ); 
Show( myRow, rows );

// Check it worked
myXvals = Column(dt, "x")[myRow];
myYvals = Column(dt, "y")[myRow];
dt << NewColumn("Distance", Numeric, Continuous, Formula(sqrt((:x - myXvals)^2 +(:y - myYvals)^2)));
dt << selectRows(VConcat(myRow, rows`));
gb = dt << Graph Builder(
						Size( 529, 453 ),
						Show Control Panel( 0 ),
						Variables( X( :x ), Y( :y ), Color( :Distance ) ),
						Elements( Points( X, Y, Legend( 5 ) ) )
					);

Re: Nearest Neighbor Algorithm: Need Optimization Help

LaserGuy — Thu, 11 Feb 2021 17:33:37 GMT

Thanks! I am learning JMP script syntaxes as I go along. Your code certainly bypasses quite a few unnecessary operations.

Re: Nearest Neighbor Algorithm: Need Optimization Help

hogi — Fri, 28 Jun 2024 20:50:10 GMT

Let's take the sample data set *) and do some benchmarking

- let's start with an example how you should NOT do it ...

*) actual use cases are ~ 500 x 500 dies and hundreds of wafers.

Names Default To Here( 1 );

dt0 = Open( "$SAMPLE_DATA/Wafer Stacked.jmp" );

wait(0);
t0 = hptime();

dt0 << New Column( "row", set each value( Row() ) );

max dist = 1; // just keep the direct neighbors	

// create a lookup table
dtx = As Table( (-21 :: 21)`, <<Invisible );
dty = As Table( (-21 :: 21)`, <<Invisible );
dtLookup = dty << Join( With( dtx ), Cartesian Join, Output Table( "xyLocations" ) );
Close( dtx, NoSave );
Close( dty, NoSave );
Column( dtLookup, 1 ) << setName( "x" );
Column( dtLookup, 2 ) << setName( "y" );

// create the KDTable 
mat = dtLookup << getAsMatrix;
tbl = KDTable( mat );
dtLookup << New Column( "row_lookup", set each value( Row() ) );

row()=50;
// add column with neighboring rows
Eval (Eval Expr(dtLookup << New Column( "neighbors",
	Expression,
	set each value(
		{neighbors, dist} = tbl << K nearest rows( 8, Row() );
		neighbors[Where( dist <= Expr(max dist) )];
	)
)));
	
// map the neighbors to the main table
dt0 << Update(
	With( dtLookup ),
	Match Columns( :X_Die = :x, :Y_Die = :y ),
	Add Columns from Update Table( :row_lookup, :neighbors ),
	Replace Columns in Main Table( None )
);

wait(0);
t1=hptime();

// split the table to find the rows faster
// to speed things up, one could do the neighbours averaging here, but that's not the target
mySubsets = dt0 << Subset( By( :Lot, :Wafer ), All rows, Selected columns only( 0 ), Private( 1 ) );

// why is this SOOOOO slow !??!
For Each( {myTable}, mySubsets,
	myTable:neighbors << set each value(
		input = :neighbors; // take the neighbors from the lookup table 
		Transform Each( {myRow}, input, // and replace them with the neighbors of the actual wafer
			result = :row[Where( myTable, :row_lookup == myRow )];
			If( N Items( result ),
				result[1],
				.
			);
		);
	)
);

//merge the subsets and map the neighbors back to the main table 
tmp = mySubsets[1] << Concatenate( mySubsets[2 :: N Items( mySubsets )], Private() );
For Each( {myTable}, mySubsets, Close( myTable, noSave ) );
dt0 << Update(
	With( tmp ),
	Match Columns( :Lot = :Lot, :Wafer = :Wafer, :X_Die = :X_Die, :Y_Die = :Y_Die ),
	Add Columns from Update Table( None ),
	Replace Columns in Main Table( :neighbors )
);
Close( tmp, noSave );

wait(0);
t2= hptime();
// now use the cool new column 
dt0 << New Column( "Defects average", Formula( Mean( :Defects[:neighbors] ) ) );

t3=hptime();

print("create lookup", (t1-t0)/1000000);
print("correct the rows", (t2-t1)/1000000);
print("calculate neighbor average", (t3-t2)/1000000);

Re: Nearest Neighbor Algorithm: Need Optimization Help

hogi — Fri, 28 Jun 2024 20:45:01 GMT

Names Default To Here( 1 );

dt0 = Open( "$SAMPLE_DATA/Wafer Stacked.jmp" );
wait(0);
	

t0 = hptime();
// get the coordinates
dtLookup = dt0 << Summary(Group( :X_Die, :Y_Die ),output table name( "xyLocations" ), Private);
dtLookup << Delete Columns( :N Rows );

dt0 << New Column( "row", set each value( Row() ) );

max dist = 1; // just keep the direct neighbors	

// create the KDTable 
mat = dtLookup << getAsMatrix;
tbl = KDTable( mat );
dtLookup << New Column( "row_lookup", set each value( Row() ) );

// add column with neighboring rows
Eval (Eval Expr(dtLookup << New Column( "neighbors",Vector,
	Expression,
	set each value(
		{neighbors, dist} = tbl << K nearest rows( 100, Row() );
		neighbors[Where( dist <= Expr(max dist) )];
	)
)));
	
// map the neighbors to the main table
dt0 << Update(
	With( dtLookup ),
	Match Columns( :X_Die = :X_Die, :Y_Die = :Y_Die ),
	Add Columns from Update Table( :row_lookup, :neighbors ),
	Replace Columns in Main Table( None )
);
Close(dtLookup, noSave);
wait(0);
t1=hptime();

//map the rows from the lookup table to the correct rows

// split the table to find the rows faster
// to speed things up, one could do the neighbours averaging here, but that's not the target
mySubsets = dt0 << Subset( By( :Lot, :Wafer ), All rows, Selected columns only( 0 ), Private( 1 ) );

For Each( {myTable}, mySubsets,

// so complicated ?!?!
dthelp= new table("help", add rows(col maximum(myTable:row_lookup)), new column("row_lookup",set each value(row())), private(1));
dthelp << Update(
	With( myTable ),
	Match Columns( :row_lookup = :row_lookup ),
	Add Columns from Update Table( :row ),
	Replace Columns in Main Table( None )
);
	myTable:neighbors << set each value(
		input = :neighbors; // take the neighbors from the lookup table 
		Transform each({row},input, dthelp[row,"row"]);
		);
	close(dthelp, noSave)
);

//merge the subsets and map the neighbors back to the main table 
tmp = mySubsets[1] << Concatenate( mySubsets[2 :: N Items( mySubsets )], Private() );
For Each( {myTable}, mySubsets, Close( myTable, noSave ) );
dt0 << Delete Columns( :neighbors );
dt0 << Update(
	With( tmp ),
	Match Columns( :Lot = :Lot, :Wafer = :Wafer, :X_Die = :X_Die, :Y_Die = :Y_Die ),
	Add Columns from Update Table( :neighbors ),
	Replace Columns in Main Table( None )
);
Close( tmp, noSave );

wait(0);
t2= hptime();

// now use the cool new column 
dt0 << New Column( "Defects average", Formula( Mean( :Defects[:neighbors] ) ) );
wait(0);
t3=hptime();

print("create lookup", (t1-t0)/1000000);
print("correct the rows", (t2-t1)/1000000);
print("calculate neighbor average", (t3-t2)/1000000);