Subscribe Bookmark RSS Feed

how to replace empty values of numeric columns with the average column value

Highlighted
gianpaolo_polsi

Community Trekker

Joined:

Mar 23, 2016

good morning,

someone can help me? ...

i would like to replace empty values of numeric columns with the average column value.

 

i tried this

dt = Current Data Table();

FILL = dt << get column names( Numeric );

For( i = 1, i <= N Items( FILL ), i++,

FILL[i][dt << get rows where( Is Missing( FILL[i][] ) )] = col mean(FILL[i]);

);

 

thanks in advance

Gianpaolo

 

2 ACCEPTED SOLUTIONS

Accepted Solutions
ian_jmp

Staff

Joined:

Jun 23, 2011

Solution

Please find one way to do this:

NamesDefaultToHere(1);

// Make some test data
dt = NewTable("Test", NewColumn("Data", Numeric, Continuous, Formula(if(Mod(Row(), 2) == 0, RandomNormal()))));
dt << addRows(20);
dt << runFormulas;
Column(dt, "Data") << deleteFormula;

// Impute missing cells with the mean of the others
Wait(3);
col = Column(dt, "Data");
values = col << getValues;
values[Loc(IsMissing(values))] = Mean(values);
col << setValues(values);

Probably it's good practice to delineate the values that were imputed by colouring their cells. 

ian_jmp

Staff

Joined:

Jun 23, 2011

Solution

I didn't study your code in detail, but had the sense that you are doing more work than you need to. To make things more readable, you could consider using functions as below:

NamesDefaultToHere(1);

// Only impute if more than n cells are missing
imputeContinuousCol =
Function({col, n}, {Default Local},
	values = col << getValues;
	missingValuePos = Loc(IsMissing(values));
	if(NRow(missingValuePos) > n,
		values[Loc(IsMissing(values))] = Mean(values);
		col << setValues(values);
		);
);

// Only impute if more than n cells are missing
imputeCharacterCol = 
Function({col, n}, {Default Local},
	values = col << getValues;
	missingValuePos = Loc(values, "");
	if(NRow(missingValuePos) > n,
		values[Loc(values, "")] = "NA";
		col << setValues(values);
		);
);

// Data table . . .
dt = DataTable("Big Class.jmp");
// List of column names that satidfy your imputation criteria . . .
imputeList = {"name", "sex", "height"};
// Loop over this list, and impute if necessary
for (c=1, c<=NItems(imputeList), c++,
	col = Column(dt, imputeList[c]);
	if(
		(col << getModelingType) == "Continuous",
			imputeContinuousCol(col, 1),
		(col << getDataType) == "Character",
			imputeCharacterCol(col, 1),
	);
);

Open 'Big Class', make some cells missing in the listed columns, then try it out. 

3 REPLIES
ian_jmp

Staff

Joined:

Jun 23, 2011

Solution

Please find one way to do this:

NamesDefaultToHere(1);

// Make some test data
dt = NewTable("Test", NewColumn("Data", Numeric, Continuous, Formula(if(Mod(Row(), 2) == 0, RandomNormal()))));
dt << addRows(20);
dt << runFormulas;
Column(dt, "Data") << deleteFormula;

// Impute missing cells with the mean of the others
Wait(3);
col = Column(dt, "Data");
values = col << getValues;
values[Loc(IsMissing(values))] = Mean(values);
col << setValues(values);

Probably it's good practice to delineate the values that were imputed by colouring their cells. 

gianpaolo_polsi

Community Trekker

Joined:

Mar 23, 2016

hi Ian
first of all thanks for your help and suggestions. your script work very well
So.... I tried to integrate your script into a more big one, in order to reduce and simply DB.

here the entire work
Names Default To Here( 1 );
clear globals();



// code to choose among all opened tables which one will be selected for the analysis
OpenDT = List();
For( i = 1, i <= N Table(), i++,
Insert Into( OpenDT, Data Table( i ) )
);


dlgDT = Dialog( "Select Data Table for analysis", VList( selectDT = List Box( OpenDT ) ), HList( Button( "OK" ), Button( "Cancel" ) ) );

Show( dlgDT );
If( dlgDT["button"] == 1,
Remove From( dlgDT, N Items( dlgDT ) );
Eval List( dlgDT );
);



Current Data Table( selectDT[1] );



// -----------------------------------------GUI -----------------------------------------//
cd = Column Dialog(
Title( " Data Preprocessing " ),
List= ColList( " Select Columns >> ", MinCol( 1 ) ),
HList( " % of Empty elements in the column -> (0-100) ", mypercent1 = EditNumber( 50 ) ),
HList( " % of Stagnant elements in the column -> (0-100) ", mypercent = EditNumber( 80 ) ),
VList(
numeric = Check Box( "Columns Numeric - Impute missing cells with the mean of the others", 1 ),
character = Check Box( "Columns character - Impute missing cells with N/A", 1 ),

),
);



//declaration of variables

impute_num_col = cd["numeric"]; // impute numerics
impute_char = cd["character"]; // impute characters

percent = cd["mypercent"];
percent1 = cd["mypercent1"];

mylist = cd["List"];

dt= selectDT[1];
delcols = {};


numRows = nrows(dt);
for(i=1; maxi=Nitems(mylist), i<=maxi, i++,
if(nrows(loc(aslist(column(dt,mylist[i])<<getasmatrix),mode(column(dt,mylist[i])<<getasmatrix))) >= numRows * percent/100,
insert into(delcols, myList[i]);
,
if(Col N Missing(column(dt,mylist[i])) >= numRows * percent1/100,
insert into(delcols, myList[i]);
);
);
);


nitems = N Items( delcols );
colname = {};
colnamelist = {};
For( p = 1, p <= nitems, p++,
colname = Column( delcols[p] ) << Get Name;
Insert Into( colnamelist, colname(p) );
);


dt << delete columns (colnamelist);








//OK! Columns Numeric - Impute missing cells with the mean of the others
If( impute_num_col == 1,

cc = dt << Get Column Names( Continuous );
ncols = N Items( cc );


For( i = 1, i <= ncols, i++,
values = Column( dt, cc[i] ) << get values( 1 );
values[Loc( Is Missing( values ) )] = Mean( values );
Column( dt, cc[i] ) << setvalues( values, cc[i] );


);
);




// Columns character - Impute missing cells with NA
If( impute_char == 1,
FILL = dt << get column names( Character );
For( i = 1, i <= N Items( FILL ), i++,
FILL[i][dt << get rows where( Is Missing( FILL[i][] ) )] = "NA";
);

);

but the problem is that the last part of script (#Columns character - Impute missing cells with NA#)
don't know why... doesn't work....
very strange because the same code works if isn't integrate in the job above.

dt = Current Data Table();
FILL = dt << get column names( Character );
For( i = 1, i <= N Items( FILL ), i++,
FILL[i][dt << get rows where( Is Missing( FILL[i][] ) )] = "NA";
);


do you have idea why it is happening... ?


thanks for collaboration
ian_jmp

Staff

Joined:

Jun 23, 2011

Solution

I didn't study your code in detail, but had the sense that you are doing more work than you need to. To make things more readable, you could consider using functions as below:

NamesDefaultToHere(1);

// Only impute if more than n cells are missing
imputeContinuousCol =
Function({col, n}, {Default Local},
	values = col << getValues;
	missingValuePos = Loc(IsMissing(values));
	if(NRow(missingValuePos) > n,
		values[Loc(IsMissing(values))] = Mean(values);
		col << setValues(values);
		);
);

// Only impute if more than n cells are missing
imputeCharacterCol = 
Function({col, n}, {Default Local},
	values = col << getValues;
	missingValuePos = Loc(values, "");
	if(NRow(missingValuePos) > n,
		values[Loc(values, "")] = "NA";
		col << setValues(values);
		);
);

// Data table . . .
dt = DataTable("Big Class.jmp");
// List of column names that satidfy your imputation criteria . . .
imputeList = {"name", "sex", "height"};
// Loop over this list, and impute if necessary
for (c=1, c<=NItems(imputeList), c++,
	col = Column(dt, imputeList[c]);
	if(
		(col << getModelingType) == "Continuous",
			imputeContinuousCol(col, 1),
		(col << getDataType) == "Character",
			imputeCharacterCol(col, 1),
	);
);

Open 'Big Class', make some cells missing in the listed columns, then try it out.