// JSL Companion: Applications of the JMP Scripting Language // Title: 9_Extra_ShortestEditDistance.jsl // Version: Windows JMP® Pro 13.2.1 // Purpose: Demonstrate JMP ShortestEditScript() function syntax and use. /*--------------------------------------------------------------------------------------------- Since the late 20th century many algorithms and scripts have been written for text "mining"; for example, - to detect plagarism - spell checking for automated data cleansing - genetic code sequencing - speech recognition and more. Vladimir Levenshtein (1965) created an algorithm to compute a metric for string matching. It is named the Levenshtein Distance (LD), but often called the "shortest edit distance." The algorithm defines the steps to convert string (sequence) A, the initial state, to string (sequence) B, the goal state. There are variants the dynamic programming task; Smith-Waterman is often used to find the longest common sequence. The steps are either Insert, Remove or Common. Some variations define a Replace for a sequence of remove and insert of the same length. Also the metric could be - the number of edit steps, - the longest common sequence, as a percent - in some instances metrics are scored by delete/insert by length Of course, in text mining, desirable characteristics might be to ignore: case, unprintable characters, spacing and syntax, or common words. The JMP Utility Function ShortestEditScript() is an implementation of the Levenshtein algorithm. As of version 13.2.1, the Scripting Index provides the most documentation on its syntax and usage, and the documentation is terse. Many thanks to JMP Support for providing additional information and an examples to answer our questions. If interested, you should do a web search for Levenshtein Distance. Also included in this script is a conversion of a Java script provided at the referenced stackoverflow link. It was written to compare the output from the JMP ShortestEditScript(). //--------Syntax ------------------------------------------------------------------ Syntax 1: defaults, simple string list = ShortestEditScript(A,B); A and B are strings and the output list shows the steps (Insert, Remove, Common) to convert string A to String B. Syntax 2: strings result = ShortestEditScript(strings(A, B, , , )); If matrix(0) or not specified, the result is the list of steps. Limit(n) will stop the function if more than n inserts and deletes have been found. **** waiting on JMP support for ignore and ignoreWhitespace() *** Syntax 3: lines result = ShortestEditScript(lines(A, B, separators("default is newline"), , , )); **** arguments are the same as syntax 2, with the added separators(), need to test more separators. Syntax 4: sequences matrix = ShortestEditScript(sequences(nA, nB, Function({iA, iB}, adata[iA] == bdata[iB]) )); Sequences returns a matrix. adata and bdata will be a vector or a list of values. Note below there is an an example of using table columns as the sequences. **** tested on numeric and character data, but not yet on an expression or a list or a matrix */ //==== Define Functions ========================================================================== // See Chapter 4 for the general syntax for a user defined function: fname = Function({arguments}, {local variables}, code ); // This function for Levenshtein distance counts a replace (delete, insert) as 1 step. // Thi is our conversion of the algorithm from // http://stackoverflow.com/questions/17889861/how-to-calculate-matching-score-between-two-string-in-java //web ("http://stackoverflow.com/questions/17889861/how-to-calculate-matching-score-between-two-string-in-java"); computeLevenshteinDistance = Function({str1, str2}, {len1, len2, distance, i, j, idl1, idl2}, len1 = length(str1); len2 = length(str2); idl1 = words(str1,""); idl2 = words(str2,""); distance = J(len1 + 1,len2 + 1, 0); for (i = 2, i <= len1+1, i++, distance[i, 1] = i-1); for (j = 2, j <= len2+1, j++, distance[1, j] = j-1); for (i = 2, i <= len1 +1, i++, for (j = 2, j <= len2+1, j++, distance[i, j] = minimum(distance[i - 1, j] + 1, distance[i, j - 1] + 1, distance[i - 1, j - 1] + ( idl1[i-1] != idl2[j - 1] )) ) ); // show (distance); // uncomment to see the temporary matrix distance[len1+1, len2+1]; ); //other algorithm implementations //web("https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Java"); //--- List of string examples, note they are in s1, s2 pairs exTextList ={ {"oklahoma", "oklwhoma"}, {"kitten","sitting"}, {"\[Token is invalid. DeviceId = deviceId: "345" ]\", "\[Token is invalid. DeviceId = deviceId: "123" ]\"}, {"Sauturday", "Saturdai"}, {"AGGCTATCACCTGACCTCCAGGCCGATGCCC", "TAGCTATCACGACCGCGGTCGATTTGCCCGAC"}, {"INTEN*TION", "*EXECUTION"}, {"Spokesman confirms senior government adviser was shot", "Spokesman said the senior adviser was shot dead"}, {"AGCACACA", "ACACACTA"}, {"2345234523456", "1234561234567"} }; //--- reminder the stackoverflow counts a remove/insert of a For( i = 1, i <= N Items( exTextList ), i++, s1 = exTextList[i][1]; s2 = exTextList[i][2]; xxLD = computeLevenshteinDistance( s1, s2 ); llSED = Shortest Edit Script( s1, s2 ); Print( "\!N", "Example: " || Char( i ), "s1 = " || s1, "s2 = " || s2, "LD = " || Char( xxLD ), "result: ", llSED, Repeat( "-", 90 ) ); ); //--- run to here look at the Log keep in mind a remove insert of same length counts as 1 in the scripted distance function //------------------------------------------------------------------------------------------------------------------------------ //Next set of examples explores options /* The matrix output n x 4 where n = nitems(llSED) Column1: -1 | 1| 0 -1-->remove, 1-->insert, 0-->common Column2: position in the 1st string .-->missing / not found Column3: position in the 2nd string .-->missing / not found Column4: length */ //Compare list and matrix output for strings run in blocks //---------------------------------------------------- s1 = exTextList[2][1]; //kitten s2 = exTextList[2][2]; //sitting llSED = shortesteditscript(strings(s1,s2)); mmSED = shortesteditscript(strings(s1,s2,matrix(1))); show(s1, s2, llSED, mmSED); /* llSED = {{"Remove", "k"}, {"Insert", "s"}, {"Common", "itt"}, {"Insert", "i"}, {"Remove", "e"}, {"Common", "n"}, {"Insert", "g"}}; mmSED = [ -1 1 . 1, //remove k 1 . 1 1, //insert s 0 2 2 3, //common itt in position 2 for both 3 long 1 . 5 1, //insert i -1 5 . 1, //remove e 0 6 6 1, //common n 1 . 7 1] //insert g */ //---------------------------------------------------- //sequences steps to convert the 1st sequence (list | column | vector) to the 2nd sequence // sequences(na, nb, Function({ia, ib}, adata[ia] == bdata[ib])) //output is always a matrix s1 = words(exTextList[2][1],""); //kitten a sequence of letters s2 = words(exTextList[2][2], ""); //sitting a sequence of letters n1 = nitems(s1); n2 = nitems(s2); mmSED = shortesteditscript(sequences(n1,n2,Function({a,b},s1[a]==s2[b]))); show(s1, s2, llSED, mmSED); //---------------------------------------------------- //row vector rv1 = [ 1 2 3 4 5]; //row vector 1 row 5 col rv2 = [ 1 2 4 6 7 8]; //row vector 1 row 7 col mmSED = shortesteditscript( sequences(ncols(rv1),ncols(rv2), function({ia,ib}, rv1[ia] == rv2[ib])) ); show(rv1, rv2, mmSED); /*: rv1 = [1 2 3 4 5]; rv2 = [1 2 4 6 7 8]; mmSED = [ 0 1 1 2, //common 2 values 1 2 both in the 1st position -1 3 . 1, //remove 1 value in rv1 the 3rd position 3 0 4 3 1, //common 1 value rv1[1,4] = 4 = rv2[1,3] 1 . 4 3, //insert 3 values in rv1 from rv2[1,4::6] = [6 7 8] -1 5 . 1]; //remove 1 value rv1[1,5] = 5 */ //---------------------------------------------------- //lines lines(s1,s2,separators(), ...) //lines if no separators it defaults to newline, separator of space compares words, steps to make //1st "sentence" to 2nd "sentence", note separators act like delimiters for Item() not like delimiters for Word() //example later //spokseman example s1 = exTextList[7][1]; s2 = exTextList[7][2]; llSED = shortesteditscript(lines(s1,s2, separators(" "))); mmSED = shortesteditscript(lines(s1,s2, separators(" "),matrix(1))); show(s1, s2, llSED, mmSED); //---------------------------------------------------- //example using words //token example s1 = exTextList[3][1]; s2 = exTextList[3][2]; llSED = shortesteditscript(lines(s1,s2, separators(" ") )); mmSED = shortesteditscript(lines(s1,s2, separators(" "), matrix(1))); show(s1, s2, llSED, mmSED); /* 3 word changes llSED = {{"Common", "Token is invalid. DeviceId = deviceId: "}, {"Remove", "\!"345\!" "}, {"Insert", "\!"123\!" "}}; mmSED = [ 0 1 1 6, //common 6 words -1 7 . 1, //remove 1 word the 7th word of s1 1 . 7 1]; //add 1 word the 7th word of s2 */ //------------------------------------------------------------------------------------------------------------------------------ //The next series of examples will take careful review. They capture the authors' learning .. //with help from JMP Support. It is yet another example of "know your tools" //---------------------------------------------------- s1="This is a HUGE test."; s2= "This is a \!"HUGE\!" test."; //only difference is the added quotes //To use escapd characters as delimiters they need to be doubled, goal is to ignore the \!" //So double the \!\! to get one \! and to add quote inside quoted text use \!" //With a space separator, lines is comparing a list of words l1 = shortesteditscript(lines(s1,s2,separators(" "), ignore("\!\!\!""))); //ignores \!" l2 = shortesteditscript(lines(s2,s1,separators(" "), ignore("\!\!\!""))); show(s1, s2, l1,l2); /*Note with the \!" ignored, the words (lines) are all common. However, the first "sentence" is reported. This could be confusing. Consider removing the values to be ignored, like punctuation, and maybe convert to one case before comparing; unless those items are important. See the next example and how it can get really confusing. */ //---------------------------------------------------- // Note that \!N is a new line aa = "this is \!Na test of \!Nshortest \!Nedit script lines \!Nwith several words"; bb = "this is a \!Ntest of \!Nshortest \!Nedit script lines with several \!Nwords"; l1 =shortesteditscript(lines(aa,bb,separators(" "), ignore("\!\!\!N "))); l2 =shortesteditscript(lines(bb,aa,separators(" "), ignore("\!\!\!N "))); show(aa, bb,l1,l2); // Since newline is ignored the two "sentences" match //---------------------------------------------------- //this is the same as the previous example, but it uses a different method //use both space and newline as separators run in small blocks //--- //---We expected the same result. Take a look at the results from the function words w1 = words(aa, "\!\!\!N "); w2 = words(bb, "\!\!\!N "); show( w1, w2, w1==w2 ); //---The same! now look at SED l1 =shortesteditscript(lines(aa,bb,separators(" \!\!\!N"))); l2 =shortesteditscript(lines(bb,aa,separators(" \!\!\!N"))); show(l1, "\!n\!n", l2); //---Hard to read so remove the commons d1=l1; d2=l2; for(i=nitems(d1), i>0, i--, if(d1[i][1]=="Common", RemoveFrom(d1,i)); ); for(i=nitems(d2), i>0, i--, if(d2[i][1]=="Common", RemoveFrom(d2,i)); ); show(d1,"\!n\!n", d2); //look at the Log. Our first reaction was HUH? /* we repalce the newline break with the symbol "\!N"" d1 = {{"Insert", "a "}, {"Remove", "a "}, {"Remove", "\!N}, {"Insert", "\!""}}; d2 = {{"Remove", "a "}, {"Insert", "a "}, {"Remove", "with several "}, {"Insert", "with several "}}; */ //---Huh? remove and insert the same value? Before a lengthy explanation, look at Item //There were 12 items in w1 and w2, but... i=0; found=0; aaItems={}; bbItems={}; While(!found, i++; xx = Item(i, aa, " \!\!\!N"); yy = Item(i, bb, " \!\!\!N"); insertInto(aaItems, xx); insertInto(bbItems, yy); if(xx=="words" | yy=="words", found=1) ); show(i, aaItems, bbItems); //see the log /*Note aaItems and bbItems each have 20 items compared to w1 and w2 that each have 12 items As noted earlier in chapter 4, Words() uses repeat delimiters as just one delimiter and Item() when it sees a space, newline, space will report two empty strings. that is each delimiter is counted: between space and newline there is and empty string and between newline and space, ditto. For l1, the ShortestEditScript does this: remove ( concat items({"", "", "a"}) and insert(concat items({"a", "", ""})) which of course looks like remove("a") and insert("a"). So our learning was that ShortestEditScript treats repeated delimiters like the Item() function. Which motivates our recommendations: - use only one delimiter, or you might see extra steps like Remove("a"), Insert("a") - or use the Words() function first and compare sequences. - pre-processing like removing the items to ignore and removing differences in case is something worth considering */ //-------------------------------------------------------------------------------------------------------------- //longest common sequence using exTextList 5, 8 9 exList ={5,8,9}; For(i=1, i<=nitems(exList), i++, k=exList[i]; s1 = exTextList[k][1]; s2 = exTextList[k][2]; mmSED = ShortestEditScript(strings(s1,s2,matrix(1))); idx = loc(mmSED[0,1]==0); //find common mlen = Max(mmSED[idx,4]); //what is the max length idy = loc(mmSED[idx,4]==mlen); //take the 1st if there is a tie //now get the key location irow = idx[idy[1]]; maxstr = substr(s1, mmSED[irow,2], mmSED[irow,4]); show(s1, s2, mmSED, irow, maxStr); ); //end for i //---------------------------------------------------------------------------------------------------------------- //This is getting long, but just run from here the the end. //This is an example of sequences and using data table columns. Color coding helps //create table 1 t1 = New Table( "t1", New Column( "Column 1", Numeric, Continuous, Format( "Best", 12 ), Set Values( [[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7]] ) ), New Column( "Column 2", Character, Nominal, Set Values( {"a", "b", "c", "d", "e", "f", "a", "b", "c", "d", "e", "f", "g"} ) ) ); //create table 2 t2 = New Table( "t2", New Column( "Column 1", Numeric, Continuous, Format( "Best", 12 ), Set Values( [[2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 6]] ) ), New Column( "Column 2", Character, Nominal, Set Values( {"b", "c", "d", "e", "b", "c", "d", "e", "b", "c", "d", "e", "e"} ) ) ); ////////////////////////// //matching on column 1 numeric values EditScript = Shortest Edit Script( sequences( N Rows( t1 ), N Rows( t2 ), Function( {a, b}, t1:column 1[A] == t2:column 1[B] ) ) ); //color cells t1: red deletes, t2 green inserts for column 1 For( i = 1, i <= N Rows( EditScript ), i++, If( EditScript[I, 1] == 1, // insert t2:column 1 << colorcells( "green", EditScript[I, 3] :: EditScript[I, 3] + EditScript[I, 4] - 1 ), EditScript[I, 1] == -1, //remove t1:column 1 << colorcells( "red", EditScript[I, 2] :: EditScript[I, 2] + EditScript[I, 4] - 1 ) ) ); ///////////////////////////////// //matching on the 2nd columns string values EditScript = Shortest Edit Script( sequences( N Rows( t1 ), N Rows( t2 ), Function( {a, b}, t1:column 2[A] == t2:column 2[B] ) ) ); //color cells t1: red deletes, t2 green inserts for column 2 For( i = 1, i <= N Rows( EditScript ), i++, If( EditScript[I, 1] == 1, // add t2:column 2 << colorcells( "green", EditScript[I, 3] :: EditScript[I, 3] + EditScript[I, 4] - 1 ), EditScript[I, 1] == -1, // delete t1:column 2 << colorcells( "red", EditScript[I, 2] :: EditScript[I, 2] + EditScript[I, 4] - 1 ) ) );