cancel
Showing results for 
Show  only  | Search instead for 
Did you mean: 
  • Learn how to build custom Python data connectors and further customize JMP’s Data Connector Framework with the Python Data Connector Demo, available now in the JMP Marketplace!
  • See how to create experiments to support product design and ID useful product features. Register for June 12 webinar, 2pm US Eastern Time.

Discussions

Solve problems, and share tips and tricks with other JMP users.
Choose Language Hide Translation Bar
minion
Level II

JMP18: python: conversion from pandas to dataTable skips columns with missing values

import jmp, numpy as np, pandas as pd
import jmputils

def p2j(df):
	dt2 = jmp.DataTable('dt2', df.shape[0])
	# get the column names from the data frame
	names = list(df.columns)
	# loop across the columns of the data frame
	for j in range( df.shape[1]):
		# check if the coulumn data type is string or numeric
		if is_string_dtype(df[names[j] ]):
			dt2.new_column(names[j], jmp.DataType.Character)
		else:
			dt2.new_column(names[j], jmp.DataType.Numeric)
		# populate the JMP column with data
		dt2[j] = list(df.iloc[:,j])
	return(dt2)

fname = r"export.csv";
df = pd.read_csv(fname)
filt_csv = p2j(df);

Using above code to read in a csv file. The p2j function seems to not import col3 correctly as it has missing values

minion_0-1717096857056.png

 

csv file contents

"col1","col2","col3"
"a","b","c"
"a","b",""
"a","b","e"
"a","b",""

 

1 ACCEPTED SOLUTION

Accepted Solutions

Re: JMP18: python: conversion from pandas to dataTable skips columns with missing values

Updated code shown here, if the column type is not numeric or string, check until we get a non-missing value before determining the column type as character or numeric.

 

import jmp
import numpy as np, pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

def p2j(df):
	dt2 = jmp.DataTable('dt2', df.shape[0])
	# get the column names from the data frame
	names = list(df.columns)
	# loop across the columns of the data frame
	for j in range( df.shape[1]):
		# check if the coulumn data type is string or numeric
		if is_numeric_dtype( df[ names[j] ] ):
			dt2.new_column(names[j], jmp.DataType.Numeric)
			dt2[j] = list(df.iloc[:,j])			# populate the JMP column with data
		elif is_string_dtype( df[ names[j]] ):
			dt2.new_column(names[j], jmp.DataType.Character)
			dt2[j] = list(df.iloc[:,j])			# populate the JMP column with data
		else:
			col = list()
			dtype = None
			for i in range(dt2.nrows):
				cell_value = df.iloc[i,j]
				# delay setting column type until we see first non-null value
				if pd.isna( cell_value ):
					col.append(None)
				else:
					if not dtype:
						if is_numeric_dtype( cell_value ):
							dtype = jmp.DataType.Numeric
						else:
							# assume it's character - could be datetime64,... 
							dtype = jmp.DataType.Character
					col.append( cell_value )
			print(col)
			if dtype: 
				dt2.new_column(names[j], dtype)
				dt2[j] = col				# populate the JMP column with data
		
	return(dt2)

fname = r"export.csv";
df = pd.read_csv(jmp.HOME + 'Downloads/' + fname)
print(df)
print(df.dtypes)
print( f'col1 is a string: {is_string_dtype(df[names[0]])}' )
print( f'col2 is a string: {is_string_dtype(df[names[1]])}' )
print( f'col3 is a string: {is_string_dtype(df[names[2]])}' )

filt_csv = p2j(df);

This gives the results:

 

/*:

  col1 col2 col3
0    a    b    c
1    a    b  NaN
2    a    b    e
3    a    b  NaN


col1    object
col2    object
col3    object
dtype: object


col1 is a string: True


col2 is a string: True


col3 is a string: False


['c', None, 'e', None]

Notice in my function I converted the NaN which is numeric to None.  The JMP DataTable code will accept None either in Numeric columns or Character columns and does the 'right' thing.  By putting a NaN in a numeric column and "" empty string in a Character column.

View solution in original post

3 REPLIES 3

Re: JMP18: python: conversion from pandas to dataTable skips columns with missing values

The short answer is that is_string_dtype( col3 ) returns False.  It is not a string column, it is a column of objects, some are strings some are NaN.  The code above treats everything that is not a string as numeric. The NaN's go in properly as missing, but the character values 'c' and 'e' are invalid for a numeric column and become missing values.  I'm looking for a better answer on reading the pandas column as character, without testing every value.

Re: JMP18: python: conversion from pandas to dataTable skips columns with missing values

Updated code shown here, if the column type is not numeric or string, check until we get a non-missing value before determining the column type as character or numeric.

 

import jmp
import numpy as np, pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

def p2j(df):
	dt2 = jmp.DataTable('dt2', df.shape[0])
	# get the column names from the data frame
	names = list(df.columns)
	# loop across the columns of the data frame
	for j in range( df.shape[1]):
		# check if the coulumn data type is string or numeric
		if is_numeric_dtype( df[ names[j] ] ):
			dt2.new_column(names[j], jmp.DataType.Numeric)
			dt2[j] = list(df.iloc[:,j])			# populate the JMP column with data
		elif is_string_dtype( df[ names[j]] ):
			dt2.new_column(names[j], jmp.DataType.Character)
			dt2[j] = list(df.iloc[:,j])			# populate the JMP column with data
		else:
			col = list()
			dtype = None
			for i in range(dt2.nrows):
				cell_value = df.iloc[i,j]
				# delay setting column type until we see first non-null value
				if pd.isna( cell_value ):
					col.append(None)
				else:
					if not dtype:
						if is_numeric_dtype( cell_value ):
							dtype = jmp.DataType.Numeric
						else:
							# assume it's character - could be datetime64,... 
							dtype = jmp.DataType.Character
					col.append( cell_value )
			print(col)
			if dtype: 
				dt2.new_column(names[j], dtype)
				dt2[j] = col				# populate the JMP column with data
		
	return(dt2)

fname = r"export.csv";
df = pd.read_csv(jmp.HOME + 'Downloads/' + fname)
print(df)
print(df.dtypes)
print( f'col1 is a string: {is_string_dtype(df[names[0]])}' )
print( f'col2 is a string: {is_string_dtype(df[names[1]])}' )
print( f'col3 is a string: {is_string_dtype(df[names[2]])}' )

filt_csv = p2j(df);

This gives the results:

 

/*:

  col1 col2 col3
0    a    b    c
1    a    b  NaN
2    a    b    e
3    a    b  NaN


col1    object
col2    object
col3    object
dtype: object


col1 is a string: True


col2 is a string: True


col3 is a string: False


['c', None, 'e', None]

Notice in my function I converted the NaN which is numeric to None.  The JMP DataTable code will accept None either in Numeric columns or Character columns and does the 'right' thing.  By putting a NaN in a numeric column and "" empty string in a Character column.

Re: JMP18: python: conversion from pandas to dataTable skips columns with missing values

Using the above p2j( ) and an updated export.csv with the a column of numeric data having a missing value as the first value gives:

 

Paul_Nelson_0-1717118219358.png

% more ~/Downloads/export.csv

"col1","col2","col3","col4"

"a","b","c",

"a","b","",2.718

"a","b","e",1.618

"a","b","",3.14159

Recommended Articles