Solved: Randomly Subselect into Groups and get mean

MikeDereviankin · Jun 9, 2023 10:02 AM

Hi,

I have a dataset of ~2000 individuals stratified by 3 categories (gender/age/ethnicity). I want to get, "pooled", values for these individuals by taking ~8 individuals from each gender/age/ethnicity and finding the average. A caveat is that there are more individuals in certain category bins than the others. Attached is a breakdown of all the individuals I got and their grouping. For instance, category 1-1-1 has 120 individuals, so I could create 8*15 groups of random individuals from these. What if I have a number not a multiple of 8?

How can I do this all in one step? In the end we'd have around 200 pools.

M. Dereviankin

txnelson · Jul 1, 2022 12:35 PM

Below is a script that works on your sample data table, creating all of the Pools you want. Please take the time to study the script so you can make any modifications you may need to adjust it to it's final form

Names Default To Here( 1 );
dt = Current Data Table();

// Set the size of the pools
poolSize = 8;
thePoolNumber = 0;

// Add row number to data table
dt << New Column( "__Row__", Set Each Value( Row() ) );

// Create a working data table
dt2 = dt << subset(
	selected rows( 0 ),
	columns( {"Gender", "Ethnicity", "Age", "__Row__"} ),
	output table( "Working" )
);

// Create start of final data table
dt2 << select rows( 1 );
dtPool = dt2 << subset( selected rows( 1 ), selected columns( 0 ), output table( "Final" ) );
dtPool << select rows( 1 );
dtPool << delete rows;
dtPool << New Column( "Pool Number" );
thePoolNumber = 0;

// Create data pools
While( N Rows( dt2 ) >= poolSize, 

	// Draw a new sample
	dtTemp = dt2 << Subset(
		Sample Size( poolSize ),
		Selected columns only( 0 ),
		Stratify( :Gender, :Ethnicity, :Age )
	);

	// Calculate the new pool numbers
	dtPoolCount = dtTemp << Summary(
		Group( :Gender, :Ethnicity, :Age ),
		Freq( "None" ),
		Weight( "None" ),
		Link to original data table( 0 )
	);
	
	// Get the current pool number
	maxPool = Col Max( dtPool:Pool Number );
	If( Is Missing( maxPool ),
		maxPool = 0
	);

	// Add Pool Number to Pool Count table
	dtPoolCount << New Column( "Pool Number", set each value( maxPool + Row() ) );
	dtPoolCount << delete columns( N Rows );

	// Set the current pool number in the current sample table
	dtTemp = dtTemp << Update(
		With( dtPoolCount ),
		Match Columns( :Gender = :Gender, :Ethnicity = :Ethnicity, :Age = :Age )
	);

	// Add the sample to the dtPool table
	dtPool = dtPool << concatenate( dtTemp, append to first table( 1 ) );

	// Delete the selected rows from the working table
	dt2 = dt2 << Update( With( dtTemp ), Match Columns( :__row__ = :__row__ ) );
	
	// 
	dt2 << select where( Is Missing( :Pool Number ) == 0 );
	dt2 << delete rows;
	dt2 << delete columns( :Pool Number );

// Cleanup temporary tables
	Close( dtTemp, nosave );
	Close( dtPoolCount, nosave );
);

If( N Rows( dt2 ) > 0, 
// Calculate the new pool numbers
	dtPoolCount = dt2 << Summary(
		Group( :Gender, :Ethnicity, :Age ),
		Freq( "None" ),
		Weight( "None" ),
		Link to original data table( 0 )
	);
	
	// Get the current pool number
	maxPool = Col Max( dtPool:Pool Number );
	If( Is Missing( maxPool ),
		maxPool = 0
	);

	// Add Pool Number to Pool Count table
	dtPoolCount << New Column( "Pool Number", set each value( maxPool + Row() ) );
	dtPoolCount << delete columns( N Rows );

	// Set the current pool number in the current sample table
	dt2 = dt2 << Update(
		With( dtPoolCount ),
		Match Columns( :Gender = :Gender, :Ethnicity = :Ethnicity, :Age = :Age )
	);
	
	// Add the sample to the dtPool table
	dtPool = dtPool << concatenate( dt2, append to first table( 1 ) );

	// Clean up
	Close( dtPoolCount, nosave );
);
// Add the Pool Number to the original data table
dt << Update( With( dtPool ), Match Columns( :__Row__ = :__Row__ ) );

// Delete the Working data table and the Pool Count data table
Close( dt2, nosave );
Close( dtPool, nosave );

Jim

View solution in original post

txnelson · Jun 30, 2022 12:31 AM

I am not totally sure I have created a script that does what you want, but it may give you a good start. Here is what it does

It finds the gender/age/ethnicity group that has the smallest population. This group will be the limiter on how many pools can be created since no rows can appear in more than one pool.
It then tries to find the pool size that minimized the number of non-included rows for that sub group. It does this by walking up the group size from 8 to 21, and finds the smallest remainder(waste).
It then creates a working table from the original table.
It loops through the working table,
1. Creates a stratified random sample
2. Adds the new sample to the final pool data table
3. Deletes from the working data table, the rows that were selected for the latest subsetted random data table.
4. Repeats as many times as possible to get as many of the smallest subgroup records as possible into the final pooled data table.

Here is the JSL

Names Default To Here( 1 );
dt = Current Data Table();

// Find smallest subgroup
Summarize( dt, subGroup = by( :age, :gender, :ethnicity ), size = Count( :age ) );
smallest = Min( size );

// Find best group size
minLost = 999999999;
minLostSize = .;
For( i = 8, i <= 21, i++,
	If( Mod( smallest, i ) < minLost,
		minLost = Mod( smallest, i );
		minLostSize = i;
	)
);

// Create a working data table
dt2 = dt << subset( selected rows( 0 ), selected columns( 0 ), output table( "Working" ) );
dt2 << New Column( "__row__", set each value( Row() ) );

// Create start of final data table
dt2 << select rows( 1 );
dtPool = dt2 << subset( selected rows( 1 ), selected columns( 0 ), output table( "Final" ) );
dtPool << select rows( 1 );
dtPool << delete rows;
dtPool << New Column( "Pool Number" );
thePoolNumber = 0;

// Create data pools
For( i = 1, i <= Floor( smallest / minLostSize ), i++,
	dtTemp = dt2 << Subset(
		Sample Size( minLostSize ),
		Selected columns only( 0 ),
		Stratify( :ethnicity, :Age, :Gender )
	);
	thePoolNumber = thePoolNumber + 1;
	dtTemp << New Column( "Pool Number", set each value( thePoolNumber ) );

	dtPool = dtPool << concatenate( dtTemp, append to first table( 1 ) );

// Remove selected rows from working table
	dtRemove = dtTemp << subset( selected rows( 0 ), columns( __row__ ) );
	dtRemove << New Column( "Delete", character, set each value( "Yes" ) );

	dt2 = dt2 << Update( With( dtRemove ), Match Columns( :__row__ = :__row__ ) );
	dt2 << select where( :Delete == "Yes" );
	dt2 << delete rows;
	dt2 << delete columns( :delete );

// Cleanup temporary tables
	Close( dtTemp, nosave );
	Close( dtRemove, nosave );
);

// Delete the Working data table
Close( dt2, nosave );

I tested this on a sample data table I created so it may have some issues with your actual table. I have annotated the code, so hopefully you will be able to learn what I have done, and be able to make adjustments to get it to work exactly as you need it.

Jim

MikeDereviankin · Jun 30, 2022 09:58 AM

Hi Jim,

This script results in a blank table. What I'm hoping to get out of this is a new column in the original data table called, "Pool Number". One thing to note is that age is not a continuous numeric variable. It's a categorical variable between numbers 1 - 4.

Once, I have all the samples in the original dataset categorized within their pool, I can calculate the mean pool value by taking the mean of the ~8 samples in the pool.

I've attached the file as a reference.

The working table makes sense in the script; you're creating a column with each row having a unique value. The creation of the final dataset leads to a blank datasheet and I do not see where this error is arising from.

M. Dereviankin

txnelson · Jun 30, 2022 03:55 PM

My JSL completes successfully if the potential pool size is reduced to 6. My code requires that at least 1 sample has to be able to be pulled from each sub group, and since subgroup, 1 2 4 only has 6 observations(rows), the code I wrote to find the best sample size had to be modified to have a pool size of 6.

But regardless of that, I am apparently not understanding how you are determining your pools. I was equating your pool, to a random sample, with a given sample size, for each subgroup of data. This turns out to be a false assumption on my part.

Are you creating pools independently for each subgroup? That is, for subgroup 1 1 1, there are 120 rows of data. If the pool size is 8, do you create15 pools? And then for group 1 1 2 which has 50 row, you would create 6 pools, with 4 rows not being selected for a pool?

Is this how you creat your pools?

Here is the break down of all of the Gender/ethnicity/age subgroups.

My current JSL does not do what you want, Please give me a more precise explanation of how to create pools, and I will see what needs to be done to put together a workable script.

Jim

MikeDereviankin · Jul 1, 2022 07:10 AM

Hi Jim,

The subgroups are created by having ~ 8 individuals in each pool. You are correct regarding 1-1-1. This group has 120 individuals, so if we were to pool these individuals randomly they should fit into exactly 15 pools. For group 1-1-2, which has 50 rows, you would create 6 pools of 8 individuals and the remaining 4 would be put into their own pool.

Does that make sense? We want all rows in the data table to be associated with a specific pool number.

M. Dereviankin

txnelson · Jul 1, 2022 12:35 PM

Below is a script that works on your sample data table, creating all of the Pools you want. Please take the time to study the script so you can make any modifications you may need to adjust it to it's final form

Names Default To Here( 1 );
dt = Current Data Table();

// Set the size of the pools
poolSize = 8;
thePoolNumber = 0;

// Add row number to data table
dt << New Column( "__Row__", Set Each Value( Row() ) );

// Create a working data table
dt2 = dt << subset(
	selected rows( 0 ),
	columns( {"Gender", "Ethnicity", "Age", "__Row__"} ),
	output table( "Working" )
);

// Create start of final data table
dt2 << select rows( 1 );
dtPool = dt2 << subset( selected rows( 1 ), selected columns( 0 ), output table( "Final" ) );
dtPool << select rows( 1 );
dtPool << delete rows;
dtPool << New Column( "Pool Number" );
thePoolNumber = 0;

// Create data pools
While( N Rows( dt2 ) >= poolSize, 

	// Draw a new sample
	dtTemp = dt2 << Subset(
		Sample Size( poolSize ),
		Selected columns only( 0 ),
		Stratify( :Gender, :Ethnicity, :Age )
	);

	// Calculate the new pool numbers
	dtPoolCount = dtTemp << Summary(
		Group( :Gender, :Ethnicity, :Age ),
		Freq( "None" ),
		Weight( "None" ),
		Link to original data table( 0 )
	);
	
	// Get the current pool number
	maxPool = Col Max( dtPool:Pool Number );
	If( Is Missing( maxPool ),
		maxPool = 0
	);

	// Add Pool Number to Pool Count table
	dtPoolCount << New Column( "Pool Number", set each value( maxPool + Row() ) );
	dtPoolCount << delete columns( N Rows );

	// Set the current pool number in the current sample table
	dtTemp = dtTemp << Update(
		With( dtPoolCount ),
		Match Columns( :Gender = :Gender, :Ethnicity = :Ethnicity, :Age = :Age )
	);

	// Add the sample to the dtPool table
	dtPool = dtPool << concatenate( dtTemp, append to first table( 1 ) );

	// Delete the selected rows from the working table
	dt2 = dt2 << Update( With( dtTemp ), Match Columns( :__row__ = :__row__ ) );
	
	// 
	dt2 << select where( Is Missing( :Pool Number ) == 0 );
	dt2 << delete rows;
	dt2 << delete columns( :Pool Number );

// Cleanup temporary tables
	Close( dtTemp, nosave );
	Close( dtPoolCount, nosave );
);

If( N Rows( dt2 ) > 0, 
// Calculate the new pool numbers
	dtPoolCount = dt2 << Summary(
		Group( :Gender, :Ethnicity, :Age ),
		Freq( "None" ),
		Weight( "None" ),
		Link to original data table( 0 )
	);
	
	// Get the current pool number
	maxPool = Col Max( dtPool:Pool Number );
	If( Is Missing( maxPool ),
		maxPool = 0
	);

	// Add Pool Number to Pool Count table
	dtPoolCount << New Column( "Pool Number", set each value( maxPool + Row() ) );
	dtPoolCount << delete columns( N Rows );

	// Set the current pool number in the current sample table
	dt2 = dt2 << Update(
		With( dtPoolCount ),
		Match Columns( :Gender = :Gender, :Ethnicity = :Ethnicity, :Age = :Age )
	);
	
	// Add the sample to the dtPool table
	dtPool = dtPool << concatenate( dt2, append to first table( 1 ) );

	// Clean up
	Close( dtPoolCount, nosave );
);
// Add the Pool Number to the original data table
dt << Update( With( dtPool ), Match Columns( :__Row__ = :__Row__ ) );

// Delete the Working data table and the Pool Count data table
Close( dt2, nosave );
Close( dtPool, nosave );

Jim

Randomly Subselect into Groups and get mean

Re: Randomly Subselect into Groups and get mean

Re: Randomly Subselect into Groups and get mean

Re: Randomly Subselect into Groups and get mean

Re: Randomly Subselect into Groups and get mean

Re: Randomly Subselect into Groups and get mean

Re: Randomly Subselect into Groups and get mean