해결됨: columns data split and rename column headers

부적절한 컨텐트 신고 · Jun 9, 2023 3:09 PM

안녕,

값이 | 또는 _ 로 구분되어 있거나 구분자가 전혀 없는 열이 많이 들어 있는 Data_original.csv 파일이 있습니다. 이 예에서는 | 가 포함된 열 3개만 보여 드립니다.

데이터 미리보기를 통해 분할이 가능한지 확인하려고 했는데 열이 엉망이 됐어요.

Data_original.csv

Fruit,ID,Colors,Weigth,CountryCode
Apple,123456,5_2|1_4,3|3|3|oz,1_2_5|3_2_1_123_2343_342|1_2
Apricot,5555666,9_0|55_298,5|3|2|1|lb,0_9_1|23
Grape,98746,12_1|23_99,1|2|1|7|6|4|2|1|8|g,9_1_3|23_45_4
Orange,25897,8_8|0_9,2|3|fl,1_0_4|99_999_888|0_8
Pear,113355,9999_77|78_123,4|6|l,-9999_76_123|9
Plum,8855446,32_-8888|00_12,5|9|lb,123_1_55|123_90
Strawberries,981254,981_123|34_123,8|3|g,89_12_45|67_1_23_8|90

Open(
	"C:\JSL\Data_original.csv",
	Import Settings(
		End Of Line( CRLF, CR, LF ),
		End Of Field( Comma, Other( "|" ), CSV( 0 ) ),
		Strip Quotes( 1 ),
		Use Apostrophe as Quotation Mark( 0 ),
		Scan Whole File( 1 ),
		Treat empty columns as numeric( 0 ),
		CompressNumericColumns( 0 ),
		CompressCharacterColumns( 0 ),
		CompressAllowListCheck( 0 ),
		Labels( 1 ),
		Column Names Start( 1 ),
		Data Starts( 2 ),
		Lines To Read( "All" ),
		Year Rule( "20xx" )
	)
);

먼저 Data_original.csv를 열고, | 로 분할되어 생긴 열을 모두 삭제하고 싶습니다(| 가 있는 경우 분할된 첫 번째 값만 유지합니다). 분할한 뒤 머리글 이름이 Column*인 열을 모두 삭제하고 csv로 저장하면 될 것이라고 생각했습니다.

Data_original_EDIT1.csv의 출력을 확인하세요.

Fruit,ID,Colors,Weigth,CountryCode
Apple,123456,5_2,3,1_2_5
Apricot,5555666,9_0,5,0_9_1
Grape,98746,12_1,1,9_1_3
Orange,25897,8_8,2,1_0_4
Pear,113355,9999_77,4,-9999_76_123
Plum,8855446,32_-8888,5,123_1_55
Strawberries,981254,981_123,8,89_12_45

두 번째로, 앞에서 저장한 csv인 Data_original_EDIT1.csv를 열어, _ 가 있는 열은 _ 를 기준으로 분할하고, 분할된 열의 머리글은 원래 열 머리글 뒤에 순번 0..n을 붙인 이름으로 바꾸고 싶습니다.

최종 출력을 csv로 내보낸 Data_original_EDIT2_Final.csv를 참조하세요.

Fruit,ID,Colors0,Colors1,Weigth,CountryCode0,CountryCode1,CountryCode2
Apple,123456,5,2,3,1,2,5
Apricot,5555666,9,0,5,0,9,1
Grape,98746,12,1,1,9,1,3
Orange,25897,8,8,2,1,0,4
Pear,113355,9999,77,4,-9999,76,123
Plum,8855446,32,-8888,5,123,1,55
Strawberries,981254,981,123,8,89,12,45

JSL 코딩을 어떻게 시작하는지 알려주세요. 감사합니다.

txnelson · Mar 28, 2021 11:29 PM

다음은 더 나은 버전의 코드로, 데이터 테이블에서 | 가 포함된 모든 열을 처리합니다.

Names Default To Here( 1 );

// Import the CSV using the comma as the only field delimiter, so that "|" and
// "_" stay inside the cell values and can be split by the code below.
dt = Open(
 "C:\JSL\Data_original.csv",
 Import Settings(
  End Of Line( CRLF, CR, LF ),
  End Of Field( Comma, CSV( 0 ) ),
  Strip Quotes( 1 ),
  Use Apostrophe as Quotation Mark( 0 ),
  Use Regional Settings( 0 ),
  Scan Whole File( 1 ),
  Treat empty columns as numeric( 0 ),
  CompressNumericColumns( 0 ),
  CompressCharacterColumns( 0 ),
  CompressAllowListCheck( 0 ),
  Labels( 1 ),
  Column Names Start( 1 ),
  Data Starts( 2 ),
  Lines To Read( "All" ),
  Year Rule( "20xx" )
 )
);

// Number of leading rows (at most 100) that are inspected when looking for a
// "|" and when counting "_"-separated values.
stopVal = Min( N Rows( dt ), 100 );

// Loop across columns back to front to expand the columns as necessary.
// Inserting new columns after the current one therefore never shifts the
// index of a column that still has to be visited. The loop stops at column 3,
// so the first two columns are never examined.
For( theColumn = N Cols( dt ), theColumn >= 3, theColumn--, 

 // Process if a character column
 If( Column( dt, theColumn ) << get data type == "Character", 
  // See if a "|" is in the data value.
  // The scan is bounded by stopVal so the row subscript never exceeds the
  // number of rows in the table. When no "|" is found, i ends at
  // stopVal + 1 and the test below fails, so the column is left untouched.
  For( i = 1, i <= stopVal, i++,
   If( Contains( Column( dt, theColumn )[i], "|" ),
    Break()
   )
  );
  
  // Get rid of everything after the first "|"
  If( i <= stopVal,
   For( theRow = 1, theRow <= N Rows( dt ), theRow++,
    Column( dt, theColumn )[theRow] = Word( 1, As Column( dt, theColumn )[theRow], "|" )
   );
   
   // Determine the potential number of values
   // Loop through the column data for up to 100 rows to find the max
   // number of "_"-separated values to be processed
   Count = 0;
   For( i = 1, i <= stopVal, i++,
    cnt = 0;
    While( Word( cnt + 1, Trim( Column( dt, theColumn )[i] ), "_" ) != "", cnt++ );
    Count = Max( Count, cnt );
   );
  
   // If Count is == 1, then just change the column to numeric
   If( Count == 1,
    Column( dt, theColumn ) << data type( numeric ) << modeling type( continuous ),
    // Otherwise the existing column becomes "<name>0" and Count - 1 new
    // numeric columns are inserted directly after it.
    origName = Column( dt, theColumn ) << get name;
    Column( dt, theColumn ) << set name( origName || "0" );
    // Substitute the literal prefix and the column reference into the
    // Add Multiple Columns call before evaluating it.
    Eval(
     Substitute(
       Expr(
        dt << add multiple columns( __orig__, Count - 1, after( __col__ ), numeric )
       ),
      Expr( __orig__ ), origName,
      Expr( __col__ ), Parse( ":" || Char( Column( dt, thecolumn ) << get name ) )
     )
    );
    // When Count == 2 only one column is added; this script expects it
    // to be named origName without a suffix, so rename it to "<name>1".
    If( Count == 2,
     Column( dt, origName ) << Set name( origName || "1" )
    );
    // Fill the new columns with the 2nd..Count-th "_"-separated values
    For( i = 1, i <= N Rows( dt ), i++,
     For( k = 1, k <= Count - 1, k++,
      Column( dt, origName || Char( k ) )[i] = Num( Word( k + 1, Column( dt, theColumn )[i], "_" ) )
     )
    );
    // Correct original 0 column: keep only the first "_"-separated value
    For( i = 1, i <= N Rows( dt ), i++,
     Column( dt, theColumn )[i] = Word( 1, Column( dt, theColumn )[i], "_" )
    );
    Column( dt, theColumn ) << data type( numeric ) << modeling type( continuous );
   );
  );
 )
);

txnelson · Mar 28, 2021 10:16 PM

해결책은 생각보다 쉽다고 생각합니다. 데이터와 그 데이터로 무엇을 하려는지 살펴본 뒤, 원본 csv 파일의 내용을 셀 단위로 그대로 읽어 들이는 접근 방식을 택했습니다. 그런 다음 데이터를 검사하면서 필요에 따라 열을 만들고 수정하기만 하면 됩니다. 이 글 하단에 있는 스크립트의 결과는 다음과 같습니다.

undefined

Names Default To Here( 1 );

// Read the CSV with each column typed explicitly. Colors, Weigth and
// CountryCode are read as Character so their "|" and "_" separators are kept.
dt = Open(
 "C:\JSL\Data_original.csv",
 columns(
  New Column( "Fruit", Character, "Nominal" ),
  New Column( "ID", Numeric, "Continuous", Format( "Best", 12 ) ),
  New Column( "Colors", Character, "Nominal" ),
  New Column( "Weigth", Character, "Nominal" ),
  New Column( "CountryCode", Character, "Nominal" )
 ),
 Import Settings(
  End Of Line( CRLF, CR, LF ),
  End Of Field( Comma, CSV( 0 ) ),
  Strip Quotes( 1 ),
  Use Apostrophe as Quotation Mark( 0 ),
  Use Regional Settings( 0 ),
  Scan Whole File( 1 ),
  Treat empty columns as numeric( 0 ),
  CompressNumericColumns( 0 ),
  CompressCharacterColumns( 0 ),
  CompressAllowListCheck( 0 ),
  Labels( 1 ),
  Column Names Start( 1 ),
  Data Starts( 2 ),
  Lines To Read( "All" ),
  Year Rule( "20xx" )
 )
);

// ---------------------------------------------------------------
// Colors
// ---------------------------------------------------------------
// Drop everything from the first "|" onward.
For Each Row( :Colors = Word( 1, :Colors, "|" ) );

// The existing column will hold the first "_"-separated value.
dt:Colors << Set Name( "Colors0" );

// Inspect at most the first 100 rows to find the largest number of
// "_"-separated values in any one cell.
scanRows = Min( N Rows( dt ), 100 );
nColorParts = 0;
For( r = 1, r <= scanRows, r++,
 pieces = 0;
 While( Word( pieces + 1, Trim( :Colors0[r] ), "_" ) != "",
  pieces++
 );
 nColorParts = Max( nColorParts, pieces );
);

// Insert the additional numeric columns right after Colors0.
dt << Add Multiple Columns( "Colors", nColorParts - 1, After( :Colors0 ), Numeric );

// With exactly two values only one column was added; give it the
// numbered name used by the fill loop below.
If( nColorParts == 2,
 dt:Colors << Set Name( "Colors1" )
);

// Copy the 2nd..nth values into Colors1..Colors(n-1).
For( r = 1, r <= N Rows( dt ), r++,
 For( p = 1, p <= nColorParts - 1, p++,
  Column( dt, "Colors" || Char( p ) )[r] = Num( Word( p + 1, :Colors0[r], "_" ) );
 )
);

// Reduce Colors0 to its first value and make it numeric.
For Each Row( :Colors0 = Word( 1, :Colors0, "_" ) );
dt:Colors0 << Data Type( Numeric ) << Modeling Type( Continuous );

// ---------------------------------------------------------------
// Weigth
// ---------------------------------------------------------------
// Keep the value before the first "|" and convert the column to numeric.
For Each Row( :Weigth = Word( 1, :Weigth, "|" ) );
dt:Weigth << Set Data Type( Numeric ) << Set Modeling Type( Continuous );

// ---------------------------------------------------------------
// CountryCode
// ---------------------------------------------------------------
// Drop everything from the first "|" onward.
For Each Row( :CountryCode = Word( 1, :CountryCode, "|" ) );

// The existing column will hold the first "_"-separated value.
dt:CountryCode << Set Name( "CountryCode0" );

// Inspect at most the first 100 rows to find the largest number of
// "_"-separated values in any one cell.
scanRows = Min( N Rows( dt ), 100 );
nCodeParts = 0;
For( r = 1, r <= scanRows, r++,
 pieces = 0;
 While( Word( pieces + 1, Trim( :CountryCode0[r] ), "_" ) != "",
  pieces++
 );
 nCodeParts = Max( nCodeParts, pieces );
);

// Insert the additional numeric columns right after CountryCode0.
dt << Add Multiple Columns( "CountryCode", nCodeParts - 1, After( :CountryCode0 ), Numeric );

// With exactly two values only one column was added; give it the
// numbered name used by the fill loop below.
If( nCodeParts == 2,
 dt:CountryCode << Set Name( "CountryCode1" )
);

// Copy the 2nd..nth values into CountryCode1..CountryCode(n-1).
For( r = 1, r <= N Rows( dt ), r++,
 For( p = 1, p <= nCodeParts - 1, p++,
  Column( dt, "CountryCode" || Char( p ) )[r] = Num( Word( p + 1, :CountryCode0[r], "_" ) );
 )
);

// Reduce CountryCode0 to its first value and make it numeric.
For Each Row( :CountryCode0 = Word( 1, :CountryCode0, "_" ) );
dt:CountryCode0 << Data Type( Numeric ) << Modeling Type( Continuous );

이 예제에서는 | 가 들어 있는 3개의 열을 하드코딩했지만, 열 목록을 읽어 열 이름을 지정하고 첫 번째 | 이후의 모든 항목을 제거하도록 쉽게 수정할 수 있습니다. "_" 개수를 세는 부분은 스크립트가 찾아낸 만큼의 열을 처리할 수 있습니다.

txnelson · Mar 28, 2021 11:29 PM

다음은 더 나은 버전의 코드로, 데이터 테이블에서 | 가 포함된 모든 열을 처리합니다.

Names Default To Here( 1 );

// Import the CSV using the comma as the only field delimiter, so that "|" and
// "_" stay inside the cell values and can be split by the code below.
dt = Open(
 "C:\JSL\Data_original.csv",
 Import Settings(
  End Of Line( CRLF, CR, LF ),
  End Of Field( Comma, CSV( 0 ) ),
  Strip Quotes( 1 ),
  Use Apostrophe as Quotation Mark( 0 ),
  Use Regional Settings( 0 ),
  Scan Whole File( 1 ),
  Treat empty columns as numeric( 0 ),
  CompressNumericColumns( 0 ),
  CompressCharacterColumns( 0 ),
  CompressAllowListCheck( 0 ),
  Labels( 1 ),
  Column Names Start( 1 ),
  Data Starts( 2 ),
  Lines To Read( "All" ),
  Year Rule( "20xx" )
 )
);

// Number of leading rows (at most 100) that are inspected when looking for a
// "|" and when counting "_"-separated values.
stopVal = Min( N Rows( dt ), 100 );

// Loop across columns back to front to expand the columns as necessary.
// Inserting new columns after the current one therefore never shifts the
// index of a column that still has to be visited. The loop stops at column 3,
// so the first two columns are never examined.
For( theColumn = N Cols( dt ), theColumn >= 3, theColumn--, 

 // Process if a character column
 If( Column( dt, theColumn ) << get data type == "Character", 
  // See if a "|" is in the data value.
  // The scan is bounded by stopVal so the row subscript never exceeds the
  // number of rows in the table. When no "|" is found, i ends at
  // stopVal + 1 and the test below fails, so the column is left untouched.
  For( i = 1, i <= stopVal, i++,
   If( Contains( Column( dt, theColumn )[i], "|" ),
    Break()
   )
  );
  
  // Get rid of everything after the first "|"
  If( i <= stopVal,
   For( theRow = 1, theRow <= N Rows( dt ), theRow++,
    Column( dt, theColumn )[theRow] = Word( 1, As Column( dt, theColumn )[theRow], "|" )
   );
   
   // Determine the potential number of values
   // Loop through the column data for up to 100 rows to find the max
   // number of "_"-separated values to be processed
   Count = 0;
   For( i = 1, i <= stopVal, i++,
    cnt = 0;
    While( Word( cnt + 1, Trim( Column( dt, theColumn )[i] ), "_" ) != "", cnt++ );
    Count = Max( Count, cnt );
   );
  
   // If Count is == 1, then just change the column to numeric
   If( Count == 1,
    Column( dt, theColumn ) << data type( numeric ) << modeling type( continuous ),
    // Otherwise the existing column becomes "<name>0" and Count - 1 new
    // numeric columns are inserted directly after it.
    origName = Column( dt, theColumn ) << get name;
    Column( dt, theColumn ) << set name( origName || "0" );
    // Substitute the literal prefix and the column reference into the
    // Add Multiple Columns call before evaluating it.
    Eval(
     Substitute(
       Expr(
        dt << add multiple columns( __orig__, Count - 1, after( __col__ ), numeric )
       ),
      Expr( __orig__ ), origName,
      Expr( __col__ ), Parse( ":" || Char( Column( dt, thecolumn ) << get name ) )
     )
    );
    // When Count == 2 only one column is added; this script expects it
    // to be named origName without a suffix, so rename it to "<name>1".
    If( Count == 2,
     Column( dt, origName ) << Set name( origName || "1" )
    );
    // Fill the new columns with the 2nd..Count-th "_"-separated values
    For( i = 1, i <= N Rows( dt ), i++,
     For( k = 1, k <= Count - 1, k++,
      Column( dt, origName || Char( k ) )[i] = Num( Word( k + 1, Column( dt, theColumn )[i], "_" ) )
     )
    );
    // Correct original 0 column: keep only the first "_"-separated value
    For( i = 1, i <= N Rows( dt ), i++,
     Column( dt, theColumn )[i] = Word( 1, Column( dt, theColumn )[i], "_" )
    );
    Column( dt, theColumn ) << data type( numeric ) << modeling type( continuous );
   );
  );
 )
);

sam_t · Mar 29, 2021 01:46 AM

범용 솔루션을 알려 주셔서 감사합니다, Jim. | 분할 10개와 _ 분할 20개로 테스트해 보았는데 완벽하게 작동합니다!!

txnelson · Mar 29, 2021 04:00 AM

@sam_t ,

잘 작동한다니 기쁩니다.

스크립트를 제공한 대가로 제가 바라는 것은, 여러분이 시간을 내어 스크립트를 한 줄씩 살펴보며 이해하고 배우는 것입니다. 하드코딩이 더 많은 제 초기 스크립트부터 시작하는 것이 좋습니다.

대부분의 스크립트와 마찬가지로 이 스크립트는 데이터의 새로운 항목으로 인해 나중에 수정해야 할 수 있으며 사용자가 변경할 수 있어야 합니다.

열 데이터 분할 및 열 머리글 이름 변경

Re: 열 데이터 분할 및 열 머리글 이름 바꾸기

Re: 열 데이터 분할 및 열 머리글 이름 바꾸기

Re: 열 데이터 분할 및 열 머리글 이름 바꾸기

Re: 열 데이터 분할 및 열 머리글 이름 바꾸기

Re: 열 데이터 분할 및 열 머리글 이름 바꾸기

추천 글

Hiding and Excluding Data

Adding Markers, Colors, and Row Legends

Binning Data Using Conditional IF-THEN Statements

Transforming Data

Creating Formulas in JMP