From 1f3106f9c2ec123655d4300fe47690c5c193f7da Mon Sep 17 00:00:00 2001
From: Mat Allen
Date: Thu, 31 Jan 2019 13:48:19 -0500
Subject: [PATCH] update

---
 src/main/java/com/redhat/sso/Controller2.java | 876 ++++++++++++++++++
 1 file changed, 876 insertions(+)
 create mode 100644 src/main/java/com/redhat/sso/Controller2.java

diff --git a/src/main/java/com/redhat/sso/Controller2.java b/src/main/java/com/redhat/sso/Controller2.java
new file mode 100644
index 0000000..5712d1b
--- /dev/null
+++ b/src/main/java/com/redhat/sso/Controller2.java
@@ -0,0 +1,876 @@
+package com.redhat.sso;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.ProtocolException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import javax.net.ssl.HttpsURLConnection;
+import javax.servlet.http.HttpServletRequest;
+import javax.ws.rs.GET;
+import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.Response;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.Logger;
+import org.codehaus.jackson.JsonGenerationException;
+import org.codehaus.jackson.map.JsonMappingException;
+import org.jsoup.Jsoup;
+import org.jsoup.parser.Parser;
+
+import com.redhat.sso.model.Document;
+import com.redhat.sso.model.Offering;
+import com.redhat.sso.model.Solution;
+import com.redhat.sso.utils.RegExHelper;
+import com.redhat.sso.utils.StrParse;
+
+import mjson.Json;
+
+@Path("/")
+public class Controller2{
+  private static final Logger log=Logger.getLogger(Controller2.class);
+
+  public static void main(String[] asd) throws JsonGenerationException, JsonMappingException, IOException{
+    System.setProperty("username", "redacted");
+    System.setProperty("password", "redacted");
+    List<Offering> result=new Controller2().search("sso_searchable", "tags,subject,content", "offering_");
+    System.out.println(com.redhat.sso.utils.Json.newObjectMapper(true).writeValueAsString(result));
+  }
+
+  @GET
+  @Path("/search")
+  public Response search2(@Context HttpServletRequest request) throws JsonGenerationException, JsonMappingException, IOException{
+    String filter=request.getParameter("filter");
+    String groupBy=null==request.getParameter("groupBy")?"offering_":request.getParameter("groupBy");
+    String fields="tags,subject,content";
+    return Response.status(200)
+        .header("Access-Control-Allow-Origin", "*")
+        .header("Content-Type","application/json")
+        .header("Cache-Control", "no-store, must-revalidate, no-cache, max-age=0")
+        .header("Pragma", "no-cache")
+//        .entity(com.redhat.sso.utils.Json.newObjectMapper(true).writeValueAsString(searchByGroup2(filter, fields, groupBy)))
+        .entity(com.redhat.sso.utils.Json.newObjectMapper(true).writeValueAsString(search(filter, fields, groupBy)))
+        .build();
+  }
+
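+  // Runtime display tweaks pushed in by the UI, e.g. GET /config/truncate/offering/40
+  // asks for the "offering" column to be cut at 40 characters. Settings live only in the
+  // in-memory map below, so they are lost on redeploy. A minimal sketch of a call,
+  // assuming the app is mounted at the context root (the mount point is an assumption):
+  //   curl http://localhost:8080/config/truncate/description/120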
+  private static Map<String,String> truncate=new HashMap<String,String>();
+
+  @GET
+  @Path("/config/{action}/{field}/{value}")
+  public Response config(@PathParam(value="action") String action, @PathParam(value="field") String field, @PathParam(value="value") String value, @Context HttpServletRequest request) throws JsonGenerationException, JsonMappingException, IOException{
+    if ("truncate".equalsIgnoreCase(action))
+      truncate.put(field, value);
+
+    System.out.println("Saving config: "+action+" "+field+" substring("+value+")");
+    return Response.status(200).build();
+  }
+
+  @GET
+  @Path("/lastMessage")
+  public Response lastMessage(@Context HttpServletRequest request) throws JsonGenerationException, JsonMappingException, IOException{
+    return Response.status(200)
+        .header("Access-Control-Allow-Origin", "*")
+        .header("Content-Type","application/json")
+        .header("Cache-Control", "no-store, must-revalidate, no-cache, max-age=0")
+        .header("Pragma", "no-cache")
+        .entity(IOUtils.toString(new FileInputStream(new File("logs/last-message-source.json"))))
+        .build();
+  }
+
+  // Lower numbers sort first: the offering page leads, overviews get 1+name length,
+  // sales kits 100, definitions 200, datasheets 300, slides 400, tasks 500, and
+  // anything else 100+name length.
+  private Integer priority(Document d){
+    if (d.name.toLowerCase().contains("offering page")) return 0;
+    if (d.name.toLowerCase().contains("overview")) return 1+d.name.length();
+    if (d.name.toLowerCase().contains("sales kit")) return 100;
+    if (d.name.toLowerCase().contains("definition")) return 200;
+    if (d.name.toLowerCase().contains("datasheet")) return 300;
+    if (d.name.toLowerCase().contains("slide")) return 400;
+    if (d.name.toLowerCase().contains("task")) return 500;
+    return 100+d.name.length();
+  }
+
+  private List<Offering> search(String commonTag, String fields, String groupBy) throws IOException{
+    int max=100;
+
+    String searchUrl="https://mojo.redhat.com/api/core/v3/contents?filter=type(document,file)&filter=tag("+commonTag+")&fields="+fields+"&count="+max;
+
+    List<Offering> allOfferings=new ArrayList<Offering>();
+    List<Document> alldocuments=new ArrayList<Document>();
+
+    // pull every page: the API hands back a "links.next" URL until the result set is exhausted
+    Json response=callMojoApi(searchUrl);
+    alldocuments.addAll(getDocuments(response));
+    while (null!=getNext(response)){
+      response=callMojoApi(getNext(response));
+      alldocuments.addAll(getDocuments(response));
+    }
+    allOfferings.addAll(aggregateIntoOfferings(alldocuments, groupBy));
+    return allOfferings;
+  }
+
+  private String getNext(Json response){
+    if (response.has("links")){
+      if (response.at("links").has("next")){
+        return response.at("links").at("next").asString();
+      }
+    }
+    return null;
+  }
+
+  private List<Document> getDocuments(Json json){
+    List<Document> result=new ArrayList<Document>();
+    int size=json.at("list").asJsonList().size();
+    log.debug("Found "+size+" documents");
+    for(int i=0;i<size;i++){
+      // ASSUMPTION: the body of this loop was lost when the patch was extracted; this is a
+      // best-guess reconstruction from the fields requested ("tags,subject,content") and the
+      // Document constructor used elsewhere in this class.
+      Json item=json.at("list").at(i);
+      List<String> tags=new ArrayList<String>();
+      for(Json t:item.at("tags").asJsonList()) tags.add(t.asString());
+      result.add(new Document(item.at("id").asString(), item.at("subject").asString(), null,
+          item.has("content")?item.at("content").at("text").asString():null,
+          item.has("resources")?item.at("resources").at("html").at("ref").asString():null, tags));
+    }
+    return result;
+  }
+
+  private List<Offering> aggregateIntoOfferings(List<Document> alldocuments, String groupBy){
+    List<Offering> offerings=new ArrayList<Offering>();
+
+    List<Document> overviews=new ArrayList<Document>();
+    List<Document> remove=new ArrayList<Document>();
+    for(Document d:alldocuments){
+      // find all overview docs
+      if (d.tags.contains("doc_overview") || d.tags.contains("community_offering")){
+        overviews.add(d);
+        remove.add(d);
+      }
+    }
+    for(Document d:remove) alldocuments.remove(d);
+    remove.clear();
+    log.debug(overviews.size()+" overview documents found");
+
+    // Now we have a list of overviews, and a separate list (initial) for all other docs
+
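+    // Each overview becomes one Offering. Community submissions carry all their data inline
+    // in the overview's own HTML; everything else links out to sibling documents that share
+    // the overview's group tag, so the loop below fills each Offering from whichever shape
+    // it finds.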
FOCUS:"})); + o.relatedProducts=extractSectionListToDocuments("<[Hh]\\d>(.+?)", overview.description, new String[]{"PRODUCTS/TECHNOLOGY FOCUS:"}); +// o.relatedProducts=Arrays.asList(extract(overview, overview.description, new String[]{"PRODUCTS/TECHNOLOGY FOCUS:"})); + +// o.relatedProducts=extractSectionListToDocuments("<[Hh]\\d.*?>(.+?)", overview.description, new String[]{"PRODUCTS/TECHNOLOGY FOCUS:"}); + + overview.name=StrParse.get(overview.name).rightOf(":").trim(); + o.documents.add(overview); + + String d1x=extract(overview, overview.description, new String[]{"COMMUNITY OFFERING DEFINITION DOCUMENT:"}); + o.documents.add(new Document(null, "Offering Definition Document", null, null, d1x, null)); + String d2x=extract(overview, overview.description, new String[]{"SUPPORTING DOCUMENTATION:"}); + o.documents.add(new Document(null, "Supporting Documentation", null, null, d2x, null)); + + for(Document d:o.documents){ + d.description=null; + } + + offerings.add(o); + + }else{ +// if (true) continue; //DEV ONLY DEV ONLY DEV ONLY DEV ONLY + // then this is a portfolio or standard offering, solution or program, and has associated documents to link + + Offering o=new Offering(); + + o.type=extractType(overview); + + // PROGRAM + if ("program".equalsIgnoreCase(o.type)){ + // get data from sales kit landing pages + + } + + // ################ + // ### SOLUTION ### + // ################ + if ("solution".equalsIgnoreCase(o.type)){ + // get data from sales kit landing pages + o.offering=Jsoup.parse(StrParse.get(overview.name).leftOf("-").trim()).text(); + overview.name="Sales Kit"; + o.description=extractDescription2(overview, overview.description); + o.related.addAll(extractSectionListToDocuments("<[Hh]\\d.*?>(.+?)", overview.description, new String[]{"OFFERINGS"})); + o.related.addAll(extractSectionListToDocuments("<[Hh]\\d.*?>(.+?)", overview.description, new String[]{"STANDARD OFFERINGS"})); + o.related.addAll(extractSectionListToDocuments("<[Hh]\\d.*?>(.+?)", overview.description, new String[]{"RELATED OFFERINGS"})); + + o.relatedProducts.addAll(extractSectionListToDocuments("<[Hh]\\d.*?>(.+?)", overview.description, new String[]{"TRAINING"})); + +// o.relatedProducts.addAll(extractSectionListToStrings("<[Hh]\\d.*?>(.+?)", overview.description, new String[]{"TRAINING"})); + +// Matcher m=Pattern.compile("class.*=.*\"url\".*href=\"(.+?)\".*").matcher(overview.description); + + String url=RegExHelper.extract(overview.description, ".*?docTitlePosition){ + o.offering=StrParse.get(overview.name).rightOf("-").trim(); + overview.name=StrParse.get(overview.name).leftOf("-").trim(); + }else{ + o.offering=StrParse.get(overview.name).leftOf("-").trim(); + overview.name=StrParse.get(overview.name).rightOf("-").trim(); + } + +// o.offering=StrParse.get(overview.name).rightOf("-").trim(); + o.description=extractDescription(overview, overview.description, new String[]{"DESCRIPTION:", "Description:"}); + + // System.out.println("configs: "+truncate.size()); +// if (truncate.containsKey("offering") && o.offering.length()>Integer.parseInt(truncate.get("offering"))) +// o.offering=o.offering.substring(0, Integer.parseInt(truncate.get("offering"))>o.offering.length()?o.offering.length():Integer.parseInt(truncate.get("offering")))+"..."; + +// if (truncate.containsKey("description") && o.description.length()>Integer.parseInt(truncate.get("description"))) +// o.description=o.description.substring(0, 
+          // ASSUMPTION: the "&amp;" entity form of one token is reconstructed here; the
+          // extracted patch showed it collapsed to a duplicate plain "&" token.
+          o.relatedProducts.addAll(extractSectionListToDocuments("<[Hh]\\d.*?>(.+?)</[Hh]\\d>", overview.description, new String[]{"PRODUCTS & TRAINING:","Products & Training:","Products &amp; Training:","PRODUCTS USED:"}));
+
+//          o.relatedProducts.addAll(extractHtmlList(overview, overview.description, new String[]{"PRODUCTS & TRAINING:","Products & Training:","PRODUCTS USED:"}));
+//          o.relatedSolutions.addAll(extractProducts(overview.description, "RELATED SOLUTIONS:"));
+
+          o.related.addAll(extractSectionListToDocuments("<[Hh]\\d.*?>(.+?)</[Hh]\\d>", overview.description, new String[]{"RELATED SOLUTIONS:","Related Solutions:"}));
+          o.related.addAll(extractSectionListToDocuments("<[Hh]\\d.*?>(.+?)</[Hh]\\d>", overview.description, new String[]{"RELATED OFFERINGS:","Related Offerings:"}));
+
+          // now, if the overview has a "Related Documents" section, then append those links too
+          o.documents.addAll(extractOtherDocuments2(overview, overview.description, new String[]{"OTHER MATERIALS:", "Other Materials:"}));
+
+//          overview.name=StrParse.get(overview.name).leftOf("-").trim();
+          overview.description="";
+
+          o.documents.add(overview);
+          String groupTag="";
+          // find the offering tag to hunt down the related docs
+          for(String tag:overview.tags){
+            if (tag.startsWith(groupBy)){
+              groupTag=tag; break;
+            }
+          }
+
+          // find the related docs using the groupTag
+          for (Document d:alldocuments){
+            if (d.tags.contains(groupTag)){
+              d.name=StrParse.get(d.name).leftOf("-").trim();
+              d.description="";
+              log.debug("Overview ("+o.offering+"):: Adding (Mojo) document -> ("+d.id+")"+d.name);
+              o.documents.add(d);
+              remove.add(d);
+            }
+          }
+          alldocuments.removeAll(remove); remove.clear();
+        }
+
+        log.debug("Overview ("+o.offering+") type="+o.type);
+
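+        // Before shipping each offering to the client, names are truncated at word
+        // boundaries (the full text is kept in `alt`, presumably for hover text) and
+        // stray "?" characters, likely mis-encoded punctuation, are blanked out.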
+        // ############################
+        // ### VALIDATE AND FINESSE ###
+        // ############################
+        if (o.description!=null)
+          o.description=o.description.replaceAll("\\?", " ");
+        for(Document d:Stream.concat(o.related.stream(), o.relatedProducts.stream()).collect(Collectors.toList())){
+          d.alt=d.name;
+          d.name=truncateBefore(d.name, 40).replaceAll("\\?", " ");
+          d.description=null; // clear this before sending to client-side
+//          System.out.println("Related/Products: name = "+d.name);
+        }
+        for(Document d:o.documents){
+          d.alt=d.name;
+          d.name=truncateBefore(d.name, 30).replaceAll("\\?", " ");
+          d.description=null; // clear this before sending to client-side
+        }
+
+        // re-order the documents by priority (offering page first, then overviews, sales kits, ...)
+        Collections.sort(o.documents, new Comparator<Document>(){
+          public int compare(Document o1, Document o2){
+            return priority(o1).compareTo(priority(o2));
+          }});
+
+        offerings.add(o);
+
+        //// NEW - this creates the list of offerings in the "associated with" column
+        //if ("solution".equalsIgnoreCase(o.type)){
+        //  // look up all other overview docs for the same "solution_?" tag and add it as a link in the relatedSOP's list
+        //  for(String tag:overview.tags){
+        //    if (tag.startsWith("solution_")){
+        //      for (Document d:overviews){
+        //        if (d.tags.contains(tag) && overview.id!=d.id /* ie. not the current overview doc */){
+        //          o.related.add(new Document(null, StrParse.get(d.name).rightOf("-"), null, d.url, null));
+        //        }
+        //      }
+        //    }
+        //  }
+        //}
+        //
+        //// NEW - this adds all similarly tagged solutions as "associated with" documents for this offering
+        //if (o.type.contains("_offering")){
+        //  for(String tag:overview.tags){
+        //
+        //  }
+        //}
+
+      }
+
+    }
+
+    log.debug("aggregated into "+offerings.size()+" offerings");
+
+    return offerings;
+  }
+
+  private Json callMojoApi(String searchUrl) throws IOException{
+//    String searchUrl="https://mojo.redhat.com/api/core/v3/contents?filter=tag("+commonTag+")&fields="+fields+"&count="+max;
+    log.debug("calling: "+searchUrl);
+    HttpsURLConnection cnn=(HttpsURLConnection)new URL(searchUrl).openConnection();
+    cnn.setRequestMethod("GET");
+    cnn.setDoOutput(true); // not actually needed for a GET; harmless since no request body is ever written
+    login(cnn, getUsername(), getPassword());
+    new File("logs").mkdirs();
+    StringBuffer sb=new StringBuffer(readAndLog("logs/last-message-source.json", cnn.getInputStream()));
+//    return sb.toString();
+    Json x=mjson.Json.read(sb.toString());
+    return x;
+  }
+
+  private void login(HttpURLConnection cnn, String username, String password) throws ProtocolException{
+    // java 8
+//    String encoding = java.util.Base64.getEncoder().encodeToString((username+":"+password).getBytes());
+    // java 7 (commons-codec)
+    String encoding = Base64.encodeBase64String((username+":"+password).getBytes());
+    cnn.setRequestProperty("Authorization", "Basic " + encoding);
+  }
+
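+  // Every raw API response is mirrored to logs/last-message-source.json by readAndLog
+  // below; the /lastMessage endpoint above serves that same file back, which makes the
+  // most recent Mojo payload inspectable without re-running a search.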
\""+arrayToString(tokensInOrder)+"\" in document: "+src.getUrl()); + return "NOT FOUND: "+arrayToString(tokensInOrder); + } + + int start=descriptionHtml.substring(0, iDesc).lastIndexOf("", start+1); + end=descriptionHtml.indexOf("

", end+1); + + if (start<0 || end<0){ + return "Are you sure one of \""+arrayToString(tokensInOrder)+"\" is within <p> tags?"; + } + String description=descriptionHtml.substring(start, end); + + String result=Jsoup.parse(description, "UTF-8").text().toString().substring(token.length()).trim(); // strip any html elements (inc the header/token + +// System.out.println("DESCRIPTION = "+result); + + return result; + } + + private String extractDescription2(Document src, String descriptionHtml){ + // find span with class="description", extract all text within that span + int s=descriptionHtml.indexOf("class=\"description\""); + s=descriptionHtml.indexOf(">", s+1); + + //Matcher m=Pattern.compile("<.*span.*?class=\"description\".*?>(.+?)").matcher(descriptionHtml); + + int e=descriptionHtml.indexOf("", s); + String result=descriptionHtml.substring(s+1, e); + return Jsoup.parse(result).text(); + } + + private String extractDescription(Document src, String descriptionHtml, String[] tokensInOrder){ + String token=null; + int iDesc=-1; + for(String t:tokensInOrder){ + if ((iDesc=descriptionHtml.indexOf(t))>=0){ + token=t; + break; + } + } +// System.out.println(descriptionHtml); +// int iDesc=descriptionHtml.indexOf(token); //find DESCRIPTION +// if (iDesc<0) descriptionHtml.indexOf(token); //find Description + + if (iDesc<0){ + log.error("Unable to find \""+arrayToString(tokensInOrder)+"\" in document: "+src.getUrl()); + return "DESCRIPTION NOT FOUND"; + } + + int start=descriptionHtml.substring(0, iDesc).lastIndexOf("=0;i--) + if ((int)sb.charAt(i)<32 || (int)sb.charAt(i)>126) sb.deleteCharAt(i); + + return sb.toString().trim(); + } + + private String arrayToString(String[] a){ + StringBuffer sb=new StringBuffer(); + for(String s:a) + sb.append(s).append(", "); + return sb.substring(0, sb.length()>2?sb.length()-2:0); + } +// private List extractHtmlList(Document src, String html, String[] tokensInOrder){ +// String token=null; +// int iDesc=-1; +// for(String t:tokensInOrder){ +// if ((iDesc=html.indexOf(t))>=0){ +// token=t; +// break; +// } +// } +// +//// int iDesc=descriptionHtml.indexOf(token); +// if (iDesc<0){ +// log.error("Unable to find any \""+arrayToString(tokensInOrder)+"\" in document: "+src.getUrl()); +// return Arrays.asList("MISSING: \""+token+"\""); //abort early if the header token is not in the document +// } +// +// int ulStart=html.indexOf("ul", iDesc); +// int ulEnd=html.indexOf("/ul", ulStart); +// +// String ul=html.substring(ulStart, ulEnd); +// // now just split by
+//  private List<String> extractHtmlList(Document src, String html, String[] tokensInOrder){
+//    String token=null;
+//    int iDesc=-1;
+//    for(String t:tokensInOrder){
+//      if ((iDesc=html.indexOf(t))>=0){
+//        token=t;
+//        break;
+//      }
+//    }
+//
+////    int iDesc=descriptionHtml.indexOf(token);
+//    if (iDesc<0){
+//      log.error("Unable to find any \""+arrayToString(tokensInOrder)+"\" in document: "+src.getUrl());
+//      return Arrays.asList("MISSING: \""+token+"\""); //abort early if the header token is not in the document
+//    }
+//
+//    int ulStart=html.indexOf("ul", iDesc);
+//    int ulEnd=html.indexOf("/ul", ulStart);
+//
+//    String ul=html.substring(ulStart, ulEnd);
+//    // now just split by <li>
+//
+//    List<String> result=new ArrayList<String>();
+//
+//    //loop
+//    int end=0;
+//    int start=ul.indexOf("<li");
+//    while(start>0){
+//      end=ul.indexOf("<li", start);
+//      String li=ul.substring(start, end);
+//      String item;
+//      if (li.indexOf("<a")>=0){
+//        int hrefStart=li.indexOf("href=")+"href=".length()+1;
+//        String url=li.substring(hrefStart, li.indexOf("\"", hrefStart));
+//        li=li.replaceAll("&nbsp;", " ");
+//        item="<a href=\""+url+"\">"+Jsoup.parse(li).text().toString().trim()+"</a>";
+//      }else{
+//        item=Jsoup.parse(li).text().toString().trim();
+//      }
+////      li=li.substring(li.indexOf("</a>")+"</a>".length()); // strip everything except for a link if it exists
+////      String item=Jsoup.parse(li).text().toString().trim();
+//      result.add(item);
+//      start=ul.indexOf("<li", end);
+//    }
+//    return result;
+//  }
+
+//  private List<Solution> extractSolutions(Document src, String descriptionHtml, String[] tokensInOrder){
+//    String token=null;
+//    int iDesc=-1;
+//    for(String t:tokensInOrder){
+//      if ((iDesc=descriptionHtml.indexOf(t))>=0){
+//        token=t;
+//        break;
+//      }
+//    }
+//
+////    int iDesc=descriptionHtml.indexOf(token);
+//    if (iDesc<0){
+//      log.error("Unable to find \""+arrayToString(tokensInOrder)+"\" in document: "+src.getUrl());
+//      return Arrays.asList(new Solution("MISSING: \""+token+"\"", null)); //abort early if the header token is not in the document
+//    }
+//
+//    int ulStart=descriptionHtml.indexOf("ul", iDesc);
+//    int ulEnd=descriptionHtml.indexOf("/ul", ulStart);
+//
+//    String ul=descriptionHtml.substring(ulStart, ulEnd);
+//    // now just split by <li>
+//
+//    List<Solution> result=new ArrayList<Solution>();
+//
+//    //loop
+//    int end=0;
+//    int start=ul.indexOf("<li");
+//    while(start>0){
+//      end=ul.indexOf("<li", start);
+//      String li=ul.substring(start, end);
+//      String name="";
+//      String url=null;
+//      if (li.indexOf("<a")>=0){
+//        int hrefStart=li.indexOf("href=")+"href=".length()+1;
+//        url=li.substring(hrefStart, li.indexOf("\"", hrefStart));
+////        item="<a href=\""+url+"\">"+Jsoup.parse(li).text().toString().trim()+"</a>";
+//      }
+//      name=Jsoup.parse(li).text().toString().trim();
+////      li=li.substring(li.indexOf("</a>")+"</a>".length()); // strip everything except for a link if it exists
+////      String item=Jsoup.parse(li).text().toString().trim();
+//      result.add(new Solution(name, url));
+//      start=ul.indexOf("<li", end);
+//    }
+//    return result;
+//  }
+
+  private static Map<String,Pattern> regexCache=new HashMap<String,Pattern>();
+  private Integer indexOf(String html, String regex){
+    return indexOf(html, regex, 0);
+  }
+  private Integer indexOf(String html, String regex, int fromIndex){
+    //Matcher m=Pattern.compile(regex).matcher(fromIndex<0?html:html.substring(fromIndex));
+    // ASSUMPTION: the body of this method was lost in extraction; reconstructed as a
+    // regex-based indexOf backed by the Pattern cache declared above.
+    Pattern p=regexCache.get(regex);
+    if (p==null){
+      p=Pattern.compile(regex);
+      regexCache.put(regex, p);
+    }
+    Matcher m=p.matcher(html);
+    return m.find(fromIndex)?m.start():-1;
+  }
+
+  private static final Pattern LI_ITERATOR_REGEX=Pattern.compile("<li.*?>(.+?)</li>");
+
+  private List<Document> extractOtherDocuments2(Document src, String html, String[] tokensInOrder){
+    Matcher m=Pattern.compile("<[Hh]\\d.*?>(.+?)</[Hh]\\d>").matcher(html);
+    int sectStart=-1;
+    int sectEnd=html.length();
+    List<String> tokens=Arrays.asList(tokensInOrder);
+
+    while(m.find()){
+      for (int i=1;i<=m.groupCount();i++){
+        String headerTitle=Jsoup.parse(m.group(i)).text().trim();
+        if (tokens.contains(headerTitle)){ // you've found a matching header
+          sectStart=m.start(i);
+          sectEnd=m.groupCount()>i?m.start(i+1):html.length();
+          break;
+        }
+      }
+    }
+
+    if (sectStart<0) return new ArrayList<Document>();
+
+    // cut just the section we're interested in (the header to the next header, or the end of the doc if there are no more sections)
+    String htmlSubsection=html.substring(sectStart, sectEnd);
+
+    int ulStart=-1;
+    ulStart=indexOf(htmlSubsection, "<[UL|ul].*>");
+
+    if (ulStart<0) return new ArrayList<Document>(); // exit early if we can't find a list
+
+    int ulEnd=indexOf(htmlSubsection, "</ul>");
+
+    String ul=htmlSubsection.substring(ulStart, ulEnd+"</ul>".length()); // cut the list section so we can parse it easier
+
+    List<Document> result=new ArrayList<Document>();
+
+    Matcher m2=LI_ITERATOR_REGEX.matcher(ul);
+    while (m2.find()){
+      String item=m2.group(1);
+      String name="";
+      String url=null;
+      if (item.indexOf("<a")>=0){
+        int hrefStart=item.indexOf("href=")+"href=".length()+1;
+        url=item.substring(hrefStart, item.indexOf("\"", hrefStart));
+      }
+//      name=Jsoup.parse(item).text().toString().trim();
+      name=Parser.unescapeEntities(Jsoup.parse(item).text(), false).trim();
+
+      // check name is not too long, if it is then truncate it
+//      name=truncate(name,30);
+
+      //if (name.length()>30){
+      //  int to=name.indexOf(" ", 30); // next space after 30 chars
+      //  if (to<0) to=name.length(); // if there's no space after 30 chars then just go to the end
+      //  name=name.substring(0, to)+"...";
+      //}
+
+      String id=null;
+      String description=null;
+      List<String> tags=null;
+//      log.debug("Overview ("+o.offering+"):: Adding (Other Materials) document -> ("+url+")"+name);
+      result.add(new Document(id, name, null, description, url, tags));
+    }
+
+    return result;
+  }
+
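+  // Both truncate helpers below cut on a word boundary rather than mid-word: truncateAfter
+  // runs on to the first space after `length`, truncateBefore backs up to the last space
+  // before it, and both append "..." to show the name was shortened.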
+  private String truncateAfter(String input, Integer length){
+    // check input is not too long, if it is then truncate it
+    if (input.length()>length){
+      int to=input.indexOf(" ", length); // next space after `length` chars
+      if (to<0) to=input.length(); // if there's no space after `length` chars then just go to the end
+      input=input.substring(0, to)+"...";
+    }
+    return input;
+  }
+
+  private String truncateBefore(String input, Integer length){
+    // check input is not too long, if it is then truncate it
+    if (input.length()>length){
+      String tmp=input.substring(0, length);
+      int to=tmp.lastIndexOf(" "); // last space before `length` chars
+      if (to<0) to=input.length(); // if there's no space then just go to the end
+      input=input.substring(0, to).trim()+"...";
+    }
+    return input;
+  }
+
+//  private List<String> extractListFromSection(String matcher, String[] tokensInOrder, Document src){
+//
+//  }
+
+//  private List<String> genericExtractListFromSection(String html, String[] tokensInOrder){
+//    return genericExtractListFromSection("<[Hh]\\d>(.+?)</[Hh]\\d>", html, tokensInOrder);
+//  }
+
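+//  Earlier variant kept for reference: the same header-then-list walk as
+//  extractSectionListToDocuments below, but returning ready-made <a href> strings
+//  instead of Document objects.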
+//  private List<String> extractSectionListToStrings(String matcher, String html, String[] tokensInOrder){
+//    Matcher m=Pattern.compile(matcher).matcher(html);
+//    int sectStart=-1;
+//    int sectEnd=html.length();
+//    List<String> tokens=Arrays.asList(tokensInOrder);
+//
+//    while(m.find()){
+//      String headerTitle=Jsoup.parse(m.group(1)).text().trim();
+//      if (tokens.contains(headerTitle)){ // you've found a matching header
+//        sectStart=m.start();
+//        if (m.find())
+//          sectEnd=m.start();
+//      }
+//    }
+//
+//    if (sectStart<0) return new ArrayList<String>();
+//
+//    // cut just the section we're interested in (the header to the next header, or the end of the doc if there are no more sections)
+//    String htmlSubsection=html.substring(sectStart, sectEnd);
+//
+//    int ulStart=-1;
+//    ulStart=indexOf(htmlSubsection, "<[UL|ul].*>");
+//
+//    if (ulStart<0) return new ArrayList<String>(); // exit early if we can't find a list
+//
+//    int ulEnd=indexOf(htmlSubsection, "</ul>");
+//
+//    String ul=htmlSubsection.substring(ulStart, ulEnd+"</ul>".length()); // cut the list section so we can parse it easier
+//
+//    List<String> result=new ArrayList<String>();
+//
+//    Matcher m2=LI_ITERATOR_REGEX.matcher(ul);
+//    while (m2.find()){
+//      String item=m2.group(1);
+//      String name="";
+//      String url=null;
+//      if (item.indexOf("<a")>=0){
+//        int hrefStart=item.indexOf("href=")+"href=".length()+1;
+//        url=item.substring(hrefStart, item.indexOf("\"", hrefStart));
+//      }
+//      name=Jsoup.parse(item).text().toString().trim();
+//      name=Jsoup.parse(item).text().toString()
+//          .replaceAll("&nbsp;", "")
+//          .trim();
+//
+////      // check name is not too long, if it is then truncate it
+////      if (name.length()>30){
+////        int to=name.indexOf(" ", 30); // next space after 30 chars
+////        if (to<0){
+////          to=name.length(); // if there's no space after 30 chars then just go to the end
+////        }else
+////          name=name.substring(0, to)+"...";
+////      }
+//
+//      String id=null;
+//      String description=null;
+//      List<String> tags=null;
+////      log.debug("Overview ("+o.offering+"):: Adding (Other Materials) document -> ("+url+")"+name);
+//
+//      result.add("<a href=\""+url+"\">"+name+"</a>");
+//
+////      result.add(new Document(id, name, description, url, tags));
+//    }
+//
+//    return result;
+//  }
+
+  /**
+   * Looks through each header matching the "matcher" regex and tries to find the "tokens"
+   * (potential header titles). If it finds one then it looks for the next <ul> and parses
+   * that list only.
+   * @param matcher
+   * @param html
+   * @param tokensInOrder
+   * @return
+   */
+  private List<Document> extractSectionListToDocuments(String matcher, String html, String[] tokensInOrder){
+    Matcher m=Pattern.compile(matcher).matcher(html);
+//    Matcher m=Pattern.compile(matcher, Pattern.DOTALL | Pattern.MULTILINE).matcher(html);
+    int sectStart=-1;
+    int sectEnd=html.length();
+    List<String> tokens=Arrays.asList(tokensInOrder);
+
+    while(m.find()){
+      String headerTitle=Jsoup.parse(m.group(1)).text().trim();
+      if (tokens.contains(headerTitle)){ // you've found a matching header
+        sectStart=m.start();
+        if (m.find())
+          sectEnd=m.start();
+      }
+    }
+
+    if (sectStart<0) return new ArrayList<Document>();
+
+    // cut just the section we're interested in (the header to the next header, or the end of the doc if there are no more sections)
+    String htmlSubsection=html.substring(sectStart, sectEnd);
+
+    int ulStart=-1;
+    ulStart=indexOf(htmlSubsection, "<[UL|ul].*>");
+
+    if (ulStart<0) return new ArrayList<Document>(); // exit early if we can't find a list
+
+    int ulEnd=indexOf(htmlSubsection, "</ul>");
+
+    String ul=htmlSubsection.substring(ulStart, ulEnd+"</ul>".length()); // cut the list section so we can parse it easier
    ".length());// cut the list section so we can parse it easier + + List result=new ArrayList(); + + Matcher m2=LI_ITERATOR_REGEX.matcher(ul); + while (m2.find()){ + String item=m2.group(1); + String name=""; + String url=null; + if (item.indexOf("=0){ + int hrefStart=item.indexOf("href=")+"href=".length()+1; + url=item.substring(hrefStart, item.indexOf("\"", hrefStart)); + } +// name=Jsoup.parse(item).text().toString() +// .replaceAll(" ", "") +// .trim(); + name=Parser.unescapeEntities(Jsoup.parse(item).text(), false).trim(); + + +// name=truncateBefore(name, 30); +// // check name is not too long, if it is then truncate it +// if (name.length()>30){ +// int to=name.indexOf(" ", 30); // next space after 30 chars +// if (to<0){ +// to=name.length();// if there's no space after 30 chars then just go to the end +// }else +// name=name.substring(0, to)+"..."; +// } + + String id=null; + String description=null; + List tags=null; +// log.debug("Overview ("+o.offering+"):: Adding (Other Materials) document -> ("+url+")"+name); + result.add(new Document(id, name, null, description, url, tags)); + } + + return result; + } + +}