/*
THIS IS NOT AN HTML PARSER
HTML parsers are hard to write. There are better ways.
Neither of these JSL ideas can properly skip comments or
many other HTML tags that should be skipped. They will
find things they shouldn't and miss other things they
should find. They've only been minimally tested on one site.
This code is presented as "how can I use contains() vs patmatch()
to efficiently work through a large string looking for something?"
The second example is FIVE TIMES FASTER and a LITTLE MORE ACCURATE.
The third example uses Text Explorer to grab links. I don't think
there is a pre-existing regex for the link text descriptions. It works
very similarly to this code; it is NOT an HTML parser either.
If you can use python, you might want to investigate "beautiful soup".
I've not used it, but believe it addresses the "hard to write" issue.
*/
u = "https://community.jmp.com/t5/Discussions/bd-p/discussions";
txt = Load Text File( u ); // the whole page, as one string
// A link on a page has at least two parts:
// the URL and some descriptive text.
//   <a ... href="url" ... > description </a>
//   ^p1                   ^p4           ^p5
// p1 = start of the opening tag, p4 = the ">" that ends the opening tag,
// p5 = start of the closing "</a>".
// To parse ALL the links on a page, you'll want
// some sort of loop. There are many ways to
// write that loop; here are two choices
// using contains and regex. "<a " will be our search token
// and contains() will be our workhorse. Use regex where appropriate.
// This will find links that it should not find because it does not skip script sections!
dt = New Table( "regex", New Column( "description", Character, "Nominal" ), New Column( "link", Character, "Nominal" ) );
Wait( .1 );
dt << beginDataUpdate; // matched by endDataUpdate after the loop
startTime = Tick Seconds();
pos = 0; // search offset; only ever moves forward
While( (p1 = Contains( txt, "<a ", pos )) != 0, // as long as we can find the start of a tag
	p4 = Contains( txt, ">", p1 + 3 ); // find the end of the opening tag, starting just past "<a "
	// No ">" after the "<a " means the opening tag is cut off (truncated page).
	// Without this guard the "</a>" search below would restart at position 1,
	// match the first "</a>" in the document, add a bogus row, and leave pos
	// unchanged -- so the same "<a " would be found forever.
	If( p4 == 0,
		Break()
	);
	p5 = Contains( txt, "</a>", p4 + 1 ); // find the ending tag
	If( p5 > p4, // as long as the end is not zero, we found one, see Break() below
		desc = Substr( txt, p4 + 1, p5 - p4 - 1 ); // the visible link description. Images can be here too.
		desc = Regex( desc, "<[^>]*>", "", globalreplace ); // remove span, image, etc tags
		linktext = Substr( txt, p1, p4 - p1 + 1 ); // the whole opening tag, e.g. <a href = "/">
		// \!" is the JSL escape for a double quote inside a string; group 1 is the
		// opening quote, group 2 the href value, and \1 requires the same closing quote.
		hreftext = Regex( linktext, "href\s*=\s*(\!"|')(.*?)(\1)", "\2" );
		If( !Is Missing( hreftext ), // sometimes there isn't one
			dt << addrows( 1 );
			irow = N Rows( dt ); // the row just added
			dt:link[irow] = hreftext;
			dt:description[irow] = desc;
		);
	, // else
		Break() // no more end tags. there is maybe a mess of javascript.
	);
	pos = Max( pos, p5 + 3 ); // advance past the one just found (p5 + 3 is the ">" of "</a>")
	Wait( 0 );
);
stoptime = Tick Seconds();
dt << endDataUpdate;
Show( stoptime - starttime ); // .5 sec, 127 links
// Second example: using pattern matching and regex.
// This is about 5X faster and skips over the garbage in the <script> sections.
// One Pat Match call walks the whole page; a Pat Test expression adds a
// table row for each link as it is matched.
// Two patterns, one for <a> links and one for <script> to skip over.
// linkpat stores the text between "<a " and the next ">" in linktext, and the
// text between that ">" and "</a>" in desctext. The Pat Test expression below
// reads both variables.
linkpat = "<a " + Pat Break( ">" ) >> linktext + ">" + Pat Arb() >> desctext + "</a>";
// scriptpat has the same shape but captures nothing; it only consumes a script section.
scriptpat = "<script" + Pat Break( ">" ) + ">" + Pat Arb() + "</script>";
dt = New Table( "patmatch", New Column( "description", Character, "Nominal" ), New Column( "link", Character, "Nominal" ) );
Wait( .1 );
dt << beginDataUpdate;
startTime = Tick Seconds();
// rc receives Pat Match's result (shown at the end).
rc = patmatch( txt,
patrepeat( // repeat the pattern until no more
// each repetition is one of three alternatives: a link, a script section, or a skip to the next "<"
(linkpat + pattest(// run some JSL for the linkpat...
desc = Regex( desctext, "<[^>]*>", "", globalreplace ); // remove span, etc tags
hreftext = Regex( linktext, "href\s*=\s*(\!"|')(.*?)(\1)", "\2" );// group 2 = the quoted href value
if( !Is Missing( hreftext ), // only record tags that have an href
dt << addrows( 1 );// append a row for this link
irow = N Rows( dt );// index of the row just added
dt:link[irow] = hreftext;// the URL
dt:description[irow] = desc; // the link text with inner tags removed
);// end if
1; // pattest needs this to succeed
)) | scriptpat /* maybe skip a script */ | /* maybe skip to the next tag */ (Pat Len( 1 ) + Pat Break( "<" ))
),
// NOTE(review): NULL presumably stands in for the optional replacement argument (nothing is
// replaced), and IGNORECASE presumably requests a case-insensitive match -- confirm against
// the Pat Match documentation.
NULL,IGNORECASE
);
stoptime = Tick Seconds();
dt << endDataUpdate;
// NOTE(review): the time recorded below is the same as the first example's, which does not
// agree with the "5X faster" claim -- re-measure before relying on either number.
Show( stoptime - starttime ); // .5 sec, 128 links
Show( rc ); // should be 1, not 0
///////////////////////////////////////
// Third example: Text Explorer can also do this.
// Put the whole page into the single cell of a one-row, one-column table...
dt = New Table( "textexplorer", Add Rows( 1 ), New Column( "text", Character, "Nominal" ) );
dt:text[1] = txt;
// ...then launch Text Explorer on that column using the "HTML Link Grabber"
// regex from the regex library.
dt << Text Explorer(
	Text Columns( :text ),
	Set Regex( Library( "HTML Link Grabber" ) ),
	Language( "English" ),
	SendToReport(
		Dispatch( {"Term and Phrase Lists"}, "", TableBox, {Sort By Column( 2, 1 )} )
	)
);
// Craige