// I added stop words and color to the output. Also made dt an invisible table.
// Title for the word cloud (used further down as the Outline Box caption).
cloud_title = "Alice in Wonderland";
// Put all the text into one variable.
txt = load text file("c:\temp\alice.txt");
aa = [=> 0]; // associative array with a default value of zero; used below as word -> count
// Count the words. JMP has a Words() function that returns a list of words, but it
// isn't able to distinguish apostrophes inside words from apostrophes on the edges
// of words. In the sample line below, 'we're and 'Oh start with a quote mark that is
// not part of the word, while can't, I'm and You're contain an apostrophe that is;
// the regex works around this.
// 'Oh, you can't help that,' said the Cat: 'we're all mad here. I'm mad. You're mad.'
//
// Pattern overview:
//   - first alternative: a run of word characters, optionally followed by a single
//     apostrophe and another run of word characters; the matched text is stored in
//     `word` and the Expr() upper-cases it and bumps its counter in `aa`.
//   - second alternative: a run of non-word characters (no counting is done for it).
//   - Pat Repeat applies the alternation between 1 and 9999999 times, GREEDY.
// NOTE(review): the Expr() ends with "" -- presumably so that it contributes an
// empty (always-matching) pattern after its side effects have run; confirm.
// The result of Pat Match is stored in rc; nothing in this section reads it.
rc = Pat Match(
txt,
Pat Repeat(
(Pat Regex( "[\w]+('[\w]+)?" ) >> word + Expr(
// Fold to upper case so differently-cased spellings share one counter.
word = Uppercase( word );
aa[word]++;
"";
)) | (Pat Regex( "[^\w]+" )),
1,
9999999,
GREEDY
)
);
// Unpack the word tallies from the associative array: the keys are the
// (upper-cased) words and the values are their occurrence counts.
vals = aa << Get Values;
keys = aa << Get Keys;
// Debug aid:
// Show( N Items( keys ), N Items( vals ) );

// Stage the tallies in an untitled, invisible data table with one row per
// distinct word so that table operations (sort, row deletion) can be used.
dt = New Table( "",
	New Column( "word", Character, Set Values( keys ) ),
	New Column( "count",
		Numeric,
		"Continuous",
		Format( "Best", 12 ),
		Set Values( vals )
	),
	invisible
);

// Most frequent words first; the table is sorted in place.
dt << Sort( By( :count ), Order( Descending ), Replace Table );
// Stop words: common words, single letters and small numbers that should not
// appear in the cloud. All entries are upper case, matching the :word column.
stop_words = {
	"THE", "WAS", "NOT", "AND", "ON", "A", "WITH", "FROM", "THIS", "OF", "WERE", "FOR", "TO",
	"AN", "BY", "IT", "OR", "AS", "HAD",
	"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
	"AT", "BE", "IN", "DID", "THAT", "NO", "ALSO", "IS", "MAY", "BUT", "HAS", "HER", "SHE",
	"HE", "HIS", "BEEN",
	"00", "01", "02", "03", "04", "05", "06", "07", "08", "09",
	"10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
	"20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
	"30", "31",
	"WHERE", "IE",
	"B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R",
	"S", "T", "U", "V", "W", "X", "Y", "Z",
	"HAVE", "LIKE", "DE", "THERE", "ALL", "GOT", "WHO", "AFTER", "ANY", "ABOUT", "COMPANY",
	"NEXT", "THEY", "IF", "THESE", "THEN", "HOWEVER", "MY"
};

// Find every row whose word is in the stop list and remove those rows from dt.
dt << Delete Rows( dt << Get Rows Where( Contains( stop_words, :word ) ) );
// Build the word-cloud window: an outline titled with cloud_title that holds a
// padded vertical list (v); each entry of v is one horizontal line of words.
New Window( "Text Word Cloud",
	Outline Box( cloud_title,
		Border Box( Left( 10 ), Right( 10 ), Top( 10 ), Bottom( 10 ), v = V List Box() )
	)
);

// Start the first line of words and attach it to the window immediately.
// (Fix: previously the first H List Box was never appended to v, so the first
// line -- the most frequent words -- never appeared in the window.)
h = H List Box();
v << Append( h );

// List of "good" colors from Scripting Guide, page 341
color_list =
[0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
31, 35, 36, 37, 38, 39, 40, 43, 44, 45, 46, 47, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62];
ncolors = N Rows( color_list );
k = 1; // index of the next color to use; wraps around after ncolors

nr = N Rows( dt );

// Only words that occur more than min_words times are drawn. Small tables
// (fewer than 100 remaining words) get a lower threshold and larger fonts.
min_words = 8;
size_multiplier = 1;
If( nr < 100,
	min_words = 2;
	size_multiplier = 2;
);

// Walk the rows from the top and stop at the first word that does not exceed
// the threshold (the script sorts dt by count, descending, before this point).
// (Fix: the loop is now bounded by the row count, so an empty table, or a table
// in which every word exceeds the threshold, no longer indexes past the last row.)
For( row = 1, row <= nr, row++,
	If( !(dt:count[row] > min_words),
		Break()
	);

	// One text box per word; the font size grows with the log of the count.
	x = Text Box( dt:word[row] || " " );
	x << Set Font Size( size_multiplier * Log( dt:count[row] ) * 4 );
	x << Font Color( color_list[k] );

	// Cycle through the color list.
	k++;
	If( k > ncolors,
		k = 1
	);

	h << Append( x );

	// Once the current line is wider than 700, start a new line for the next word.
	If( (h << Get Width) > 700,
		h = H List Box();
		v << Append( h );
	);

	// NOTE(review): presumably yields so the window can refresh and Get Width
	// reflects the boxes appended so far -- confirm before removing.
	Wait( 0 );
);

Wait( 0 );
// The staging table is no longer needed once the window is built.
Close( dt, No Save );