cancel
Showing results for 
Show  only  | Search instead for 
Did you mean: 
JMP is taking Discovery online, April 16 and 18. Register today and join us for interactive sessions featuring popular presentation topics, networking, and discussions with the experts.
Choose Language Hide Translation Bar
gianpaolo
Level IV

how to replace empty values of numeric columns with the average column value

good morning,

someone can help me? ...

i would like to replace empty values of numeric columns with the average column value.

 

i tried this

dt = Current Data Table();

FILL = dt << get column names( Numeric );

For( i = 1, i <= N Items( FILL ), i++,

FILL[i][dt << get rows where( Is Missing( FILL[i][] ) )] = col mean(FILL[i]);

);

 

thanks in advance

Gianpaolo

 

Gianpaolo Polsinelli
2 ACCEPTED SOLUTIONS

Accepted Solutions
ian_jmp
Staff

Re: how to replace empty values of numeric columns with the average column value

Please find one way to do this:

NamesDefaultToHere(1);

// Make some test data
dt = NewTable("Test", NewColumn("Data", Numeric, Continuous, Formula(if(Mod(Row(), 2) == 0, RandomNormal()))));
dt << addRows(20);
dt << runFormulas;
Column(dt, "Data") << deleteFormula;

// Impute missing cells with the mean of the others
Wait(3);
col = Column(dt, "Data");
values = col << getValues;
values[Loc(IsMissing(values))] = Mean(values);
col << setValues(values);

Probably it's good practice to delineate the values that were imputed by colouring their cells. 

View solution in original post

ian_jmp
Staff

Re: how to replace empty values of numeric columns with the average column value

I didn't study your code in detail, but had the sense that you are doing more work than you need to. To make things more readable, you could consider using functions as below:

NamesDefaultToHere(1);

// Only impute if more than n cells are missing
imputeContinuousCol =
Function({col, n}, {Default Local},
	values = col << getValues;
	missingValuePos = Loc(IsMissing(values));
	if(NRow(missingValuePos) > n,
		values[Loc(IsMissing(values))] = Mean(values);
		col << setValues(values);
		);
);

// Only impute if more than n cells are missing
imputeCharacterCol = 
Function({col, n}, {Default Local},
	values = col << getValues;
	missingValuePos = Loc(values, "");
	if(NRow(missingValuePos) > n,
		values[Loc(values, "")] = "NA";
		col << setValues(values);
		);
);

// Data table . . .
dt = DataTable("Big Class.jmp");
// List of column names that satidfy your imputation criteria . . .
imputeList = {"name", "sex", "height"};
// Loop over this list, and impute if necessary
for (c=1, c<=NItems(imputeList), c++,
	col = Column(dt, imputeList[c]);
	if(
		(col << getModelingType) == "Continuous",
			imputeContinuousCol(col, 1),
		(col << getDataType) == "Character",
			imputeCharacterCol(col, 1),
	);
);

Open 'Big Class', make some cells missing in the listed columns, then try it out. 

View solution in original post

3 REPLIES 3
ian_jmp
Staff

Re: how to replace empty values of numeric columns with the average column value

Please find one way to do this:

NamesDefaultToHere(1);

// Make some test data
dt = NewTable("Test", NewColumn("Data", Numeric, Continuous, Formula(if(Mod(Row(), 2) == 0, RandomNormal()))));
dt << addRows(20);
dt << runFormulas;
Column(dt, "Data") << deleteFormula;

// Impute missing cells with the mean of the others
Wait(3);
col = Column(dt, "Data");
values = col << getValues;
values[Loc(IsMissing(values))] = Mean(values);
col << setValues(values);

Probably it's good practice to delineate the values that were imputed by colouring their cells. 

gianpaolo
Level IV

Re: how to replace empty values of numeric columns with the average column value

hi Ian
first of all thanks for your help and suggestions. your script work very well
So.... I tried to integrate your script into a more big one, in order to reduce and simply DB.

here the entire work
Names Default To Here( 1 );
clear globals();



// code to choose among all opened tables which one will be selected for the analysis
OpenDT = List();
For( i = 1, i <= N Table(), i++,
Insert Into( OpenDT, Data Table( i ) )
);


dlgDT = Dialog( "Select Data Table for analysis", VList( selectDT = List Box( OpenDT ) ), HList( Button( "OK" ), Button( "Cancel" ) ) );

Show( dlgDT );
If( dlgDT["button"] == 1,
Remove From( dlgDT, N Items( dlgDT ) );
Eval List( dlgDT );
);



Current Data Table( selectDT[1] );



// -----------------------------------------GUI -----------------------------------------//
cd = Column Dialog(
Title( " Data Preprocessing " ),
List= ColList( " Select Columns >> ", MinCol( 1 ) ),
HList( " % of Empty elements in the column -> (0-100) ", mypercent1 = EditNumber( 50 ) ),
HList( " % of Stagnant elements in the column -> (0-100) ", mypercent = EditNumber( 80 ) ),
VList(
numeric = Check Box( "Columns Numeric - Impute missing cells with the mean of the others", 1 ),
character = Check Box( "Columns character - Impute missing cells with N/A", 1 ),

),
);



//declaration of variables

impute_num_col = cd["numeric"]; // impute numerics
impute_char = cd["character"]; // impute characters

percent = cd["mypercent"];
percent1 = cd["mypercent1"];

mylist = cd["List"];

dt= selectDT[1];
delcols = {};


numRows = nrows(dt);
for(i=1; maxi=Nitems(mylist), i<=maxi, i++,
if(nrows(loc(aslist(column(dt,mylist[i])<<getasmatrix),mode(column(dt,mylist[i])<<getasmatrix))) >= numRows * percent/100,
insert into(delcols, myList[i]);
,
if(Col N Missing(column(dt,mylist[i])) >= numRows * percent1/100,
insert into(delcols, myList[i]);
);
);
);


nitems = N Items( delcols );
colname = {};
colnamelist = {};
For( p = 1, p <= nitems, p++,
colname = Column( delcols[p] ) << Get Name;
Insert Into( colnamelist, colname(p) );
);


dt << delete columns (colnamelist);








//OK! Columns Numeric - Impute missing cells with the mean of the others
If( impute_num_col == 1,

cc = dt << Get Column Names( Continuous );
ncols = N Items( cc );


For( i = 1, i <= ncols, i++,
values = Column( dt, cc[i] ) << get values( 1 );
values[Loc( Is Missing( values ) )] = Mean( values );
Column( dt, cc[i] ) << setvalues( values, cc[i] );


);
);




// Columns character - Impute missing cells with NA
If( impute_char == 1,
FILL = dt << get column names( Character );
For( i = 1, i <= N Items( FILL ), i++,
FILL[i][dt << get rows where( Is Missing( FILL[i][] ) )] = "NA";
);

);

but the problem is that the last part of script (#Columns character - Impute missing cells with NA#)
don't know why... doesn't work....
very strange because the same code works if isn't integrate in the job above.

dt = Current Data Table();
FILL = dt << get column names( Character );
For( i = 1, i <= N Items( FILL ), i++,
FILL[i][dt << get rows where( Is Missing( FILL[i][] ) )] = "NA";
);


do you have idea why it is happening... ?


thanks for collaboration
Gianpaolo Polsinelli
ian_jmp
Staff

Re: how to replace empty values of numeric columns with the average column value

I didn't study your code in detail, but had the sense that you are doing more work than you need to. To make things more readable, you could consider using functions as below:

NamesDefaultToHere(1);

// Only impute if more than n cells are missing
imputeContinuousCol =
Function({col, n}, {Default Local},
	values = col << getValues;
	missingValuePos = Loc(IsMissing(values));
	if(NRow(missingValuePos) > n,
		values[Loc(IsMissing(values))] = Mean(values);
		col << setValues(values);
		);
);

// Only impute if more than n cells are missing
imputeCharacterCol = 
Function({col, n}, {Default Local},
	values = col << getValues;
	missingValuePos = Loc(values, "");
	if(NRow(missingValuePos) > n,
		values[Loc(values, "")] = "NA";
		col << setValues(values);
		);
);

// Data table . . .
dt = DataTable("Big Class.jmp");
// List of column names that satidfy your imputation criteria . . .
imputeList = {"name", "sex", "height"};
// Loop over this list, and impute if necessary
for (c=1, c<=NItems(imputeList), c++,
	col = Column(dt, imputeList[c]);
	if(
		(col << getModelingType) == "Continuous",
			imputeContinuousCol(col, 1),
		(col << getDataType) == "Character",
			imputeCharacterCol(col, 1),
	);
);

Open 'Big Class', make some cells missing in the listed columns, then try it out.