jSoup - remove particular style attributes - java

Am I missing something? Is there a better way to do this?
INPUT:
<span style="FONT-FAMILY: 'Lucida Sans','sans-serif'; COLOR: #003572; FONT-SIZE: 9pt;
mso-fareast-font-family: Calibri; mso-ansi-language: EN-US; mso-fareast-language: EN-US;
mso-bidi-language: AR-SA; mso-fareast-theme-font: minor-latin">Dr. Who is
<u>usually</u> available for consultations Mon - Thurs afternoons and Friday 9a-
12p at 555-1212. </span>
DESIRED OUTPUT:
<span style="COLOR: #003572; FONT-SIZE: 9pt;">Dr. Who is
<u>usually</u> available for consultations Mon - Thurs
afternoons and Friday 9a-12p at 555-1212. </span>
MY CODE SO FAR:
// Cleans the HTML within the Week Long note before writing to the DB.
// Whitelist: start from simpleText() and additionally allow br, p and span,
// plus the style attribute on every tag (":all").
// NOTE(review): the instance created by new Whitelist() is replaced on the next line.
Whitelist wl = new Whitelist();
wl = Whitelist.simpleText();
wl.addTags("br");
wl.addTags("p");
wl.addTags("span");
wl.addAttributes(":all","style");
// Wrap the note in a full HTML skeleton and parse it.
Document doc =
Jsoup.parse(
"<html><head></head><body>"+ds.getWeeklongNote()+"</body></html>");
// Visit every element and every attribute on it.
Elements e = doc.select("*");
for (Element el : e){
for (Attribute attr : el.attributes()){
// NOTE(review): this compares the attribute key with "span", which is a tag name;
// the attribute being filtered is "style", so this branch presumably never runs — confirm.
if (attr.getKey().equals("span")){
String newValue = "";
String s = attr.getValue();
// Split the attribute value into its ";"-separated parts.
String[] values = s.split(";");
for (String value : values){
// NOTE(review): parts after the first keep their leading whitespace, so
// startsWith may not match them — confirm against real input.
if (value.startsWith("COLOR")||value.startsWith("FONT-SIZE")){
// NOTE(review): appends the whole attribute (key=value), not the matching part `value`.
newValue += attr.getKey()+"="+attr.getValue()+";";
}
}
attr.setValue(newValue);
}
}
}
// Replace the document's HTML with the outer HTML of the selected elements,
// then clean the body against the whitelist and store the result.
doc.html(e.outerHtml());
ds.setWeekLongNote(Jsoup.clean(doc.body().outerHtml(), wl));

Try this:
// Keeps only the COLOR and FONT-SIZE declarations in the style attribute of
// every <span> in the parsed HTML, then replaces the document content with
// the filtered spans.
Document doc = Jsoup.parse(html);
Elements spans = doc.body().getElementsByTag("span");
for (Element span : spans) {
    // Look the attribute up by name: a span may have no attributes at all,
    // and "style" is not guaranteed to be the first one.
    if (!span.hasAttr("style")) {
        continue;
    }
    String[] items = span.attr("style").trim().split(";");
    StringBuilder newValue = new StringBuilder();
    for (String item : items) {
        int colon = item.indexOf(':');
        if (colon < 0) {
            continue; // not a "property: value" declaration
        }
        // Compare the property name itself so that e.g. BACKGROUND-COLOR
        // is not mistaken for COLOR; CSS property names are case-insensitive.
        String property = item.substring(0, colon).trim();
        if (property.equalsIgnoreCase("COLOR") || property.equalsIgnoreCase("FONT-SIZE")) {
            Log.i("Style Item: ", "" + item);
            newValue.append(item).append(";");
        }
    }
    span.attr("style", newValue.toString());
    Log.i("New Atrrbute: ", "" + newValue);
}
Log.i("FINAL HTML: ", "" + spans.outerHtml());
doc.html(spans.outerHtml());
Output:
08-17 18:28:07.692: I/FINAL HTML:(8148): <span style=" COLOR: #003572; FONT-SIZE: 9pt;">Dr. Who is <u>usually</u> available for consultations Mon - Thurs afternoons and Friday 9a- 12p at 555-1212. </span>
Cheers,

If you have more than one span element you can use this code snippet:
Document document = Jsoup.parse(html);
// Lower-case names of the style properties that may stay on a span.
Vector<String> allowedItems = new Vector<String>();
allowedItems.add("color");
allowedItems.add("font-size");
for (Element span : document.getElementsByTag("span")) {
    // Keep each "property: value" declaration whose property is allowed.
    Vector<String> kept = new Vector<String>();
    for (String declaration : span.attr("style").split(";")) {
        String property = declaration.split(":")[0].trim().toLowerCase();
        if (allowedItems.contains(property)) {
            kept.add(declaration);
        }
    }
    // Nothing survived: drop the attribute instead of leaving it empty.
    if (kept.isEmpty()) {
        span.removeAttr("style");
        continue;
    }
    span.attr("style", StringUtils.join(kept, ";"));
}

// Remove the style attribute from every element except <img>.
Elements elms = doc.select("*").not("img");
for (Element e : elms) {
    // Actually remove the attribute rather than assigning an empty value,
    // which would add style="" to elements that never had one.
    e.removeAttr("style");
}

Related

Jsoup casting Element as TextNode causes exception

What I am trying to parse using jsoup is the following text, "Numéro d'arrêt : 5216" and "Numéro NOR : 63066", but nothing seems to work. Any advice and suggestions will be greatly appreciated:
`
<div class="1st">
<p>
<h3>Numérotation : </h3>
Numéro d'arrêt : 5216
<br />
Numéro NOR : 63066
</div>
`
UPDATE :
I got this code to run, but it keeps throwing this exception: org.jsoup.nodes.Element cannot be cast to org.jsoup.nodes.TextNode :
// Fetch the page and print the title found in each div.arret.
Document tunisie = Jsoup.connect("http://www.juricaf.org/arret/TUNISIE-COURDECASSATION-20060126-5216").get();
for (Element titres : tunisie.select("div.arret")){
String titre = titres.select("h1#titre").first().text();
System.out.println(titre);
System.out.println("\n");
}
// Print the node that directly follows each h3 in the document.
for (Element node : tunisie.select("h3")) {
// NOTE(review): the cast assumes the next sibling is always a TextNode; when an
// h3 is followed by an element this is where the reported exception would arise.
TextNode numérodarrêt = (TextNode) node.nextSibling();
System.out.println(" " + numérodarrêt.text());
System.out.println("\n");
}
// NOR number and URN LEX identifier: print the node that directly follows each br.
for (Element element2 : tunisie.select("br")) {
// NOTE(review): same unchecked cast as above.
TextNode NuméroNOR_IdentifiantURNLEX = (TextNode) element2.nextSibling();
System.out.println(" " + NuméroNOR_IdentifiantURNLEX.text());
System.out.println("\n");
}
UPDATE :
Here is what I am trying to parse; the image link is below.
Parsing text outside html tags
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Prints the title of a juricaf.org ruling and the loose text that follows the
 * first h3 and the first two br tags inside the first {@code div.arret}.
 */
public class JsoupTest {
    private static final String PAGE_URL =
            "http://www.juricaf.org/arret/TUNISIE-COURDECASSATION-20060126-5216";

    /**
     * Fetches the page and prints the extracted values, one per line.
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        Document tunisie = Jsoup.connect(PAGE_URL).get();
        // get the first div in class arret; first() yields null when nothing matches
        Element arret = tunisie.select("div.arret").first();
        if (arret == null) {
            System.out.println("No div.arret element found");
            return;
        }
        // select h1 tag by its ID to get the title
        System.out.println(arret.select("#titre").text());
        // the text after the first h3 is that h3's next sibling node
        System.out.println(textAfter(arret.select("h3").first()));
        // same for the first two br tags; fewer than two are tolerated
        int printed = 0;
        for (Element br : arret.getElementsByTag("br")) {
            if (printed++ == 2) {
                break;
            }
            System.out.println(textAfter(br));
        }
    }

    /** Returns the string form of the node right after {@code el}, or "" if there is none. */
    private static String textAfter(Element el) {
        if (el == null || el.nextSibling() == null) {
            return "";
        }
        return el.nextSibling().toString();
    }
}
// Sample markup: loose text sits directly inside the div, next to h3 and br.
String html = "<div class=\"1st\">\n" +
        "<p>\n" +
        "<h3>Numérotation : </h3>\n" +
        "Numéro d'arrêt : 5216\n" +
        "<br />\n" +
        "Numéro NOR : 63066\n" +
        "</div>";
Document doc = Jsoup.parse(html);
// ownText() is printed once for every div matched by the class selector.
doc.select("div.1st").forEach(div -> System.out.println(div.ownText()));

How to properly print nested HTML lists using iText? [duplicate]

I have XHTML content, and I have to create from this content a PDF file on the fly. I use iText pdf converter.
I tried the simple way, but I always get bad result after calling the XMLWorkerHelper parser.
XHTML:
<ul>
<li>First
<ol>
<li>Second</li>
<li>Second</li>
</ol>
</li>
<li>First</li>
</ul>
The expected value:
First
Second
Second
First
PDF result:
First Second Second
First
In the result there is no nested list. I need a solution for calling the parser, and not creating an iText Document instance.
Please take a look at the example NestedListHtml
In this example, I take your code snippet list.html:
<ul>
<li>First
<ol>
<li>Second</li>
<li>Second</li>
</ol>
</li>
<li>First</li>
</ul>
And I parse it into an ElementList:
// Parses the HTML file into an ElementList instead of writing to a Document.
// CSS
CSSResolver cssResolver =
        XMLWorkerHelper.getInstance().getDefaultCssResolver(true);
// HTML
HtmlPipelineContext htmlContext = new HtmlPipelineContext(null);
htmlContext.setTagFactory(Tags.getHtmlTagProcessorFactory());
htmlContext.autoBookmark(false);
// Pipelines: CSS resolution -> HTML processing -> collection into `elements`
ElementList elements = new ElementList();
ElementHandlerPipeline end = new ElementHandlerPipeline(elements, null);
HtmlPipeline html = new HtmlPipeline(htmlContext, end);
CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);
// XML Worker
XMLWorker worker = new XMLWorker(css, true);
XMLParser p = new XMLParser(worker);
// try-with-resources so the file handle is released even if parsing fails
try (FileInputStream in = new FileInputStream(HTML)) {
    p.parse(in);
}
Now I can add this list to the Document:
// Add every parsed element to the document, in list order.
for (Element e : elements) {
document.add(e);
}
Or I can add this list to a Paragraph:
// Collect every parsed element into one Paragraph, then add that to the document.
Paragraph para = new Paragraph();
for (Element e : elements) {
para.add(e);
}
document.add(para);
You will get the desired result as shown in nested_list.pdf
You can not add nested lists to a PdfPCell or to a ColumnText. For instance: this will not work:
// Counter-example: puts the parsed elements into a table cell.
// The accompanying text states that nested lists do not work this way.
PdfPTable table = new PdfPTable(2);
table.addCell("Nested lists don't work in a cell");
PdfPCell cell = new PdfPCell();
for (Element e : elements) {
cell.addElement(e);
}
table.addCell(cell);
document.add(table);
This is due to a limitation in the ColumnText class that has been there for many years. We have evaluated the problem and the only way to fix this, would be to rewrite ColumnText entirely. This is not an item on our current technical road map.
Here's a workaround for nested ordered and un-ordered lists.
The rich text editor I am using gives the li tags a class attribute of the form "ql-indent-1", "ql-indent-2", and so on; based on that attribute, the code below adds the starting and ending ul/ol tags.
/**
 * Runs {@link #replaceIndents} over the fragment, first for its {@code ul}
 * lists and then for its {@code ol} lists, skipping a list type that does
 * not occur.
 *
 * @param htmlContent HTML fragment to rewrite
 * @return the rewritten HTML, or the input unchanged when it has no lists
 */
public String replaceIndentSubList(String htmlContent) {
    org.jsoup.nodes.Document parsed = Jsoup.parseBodyFragment(htmlContent);
    Elements unorderedLists = parsed.select("ul");
    Elements orderedLists = parsed.select("ol");
    String result = htmlContent;
    if (!unorderedLists.isEmpty()) {
        result = replaceIndents(result, unorderedLists, "ul");
    }
    if (!orderedLists.isEmpty()) {
        result = replaceIndents(result, orderedLists, "ol");
    }
    return result;
}
public String replaceIndents(String htmlContent, Elements element, String tagType) {
String attributeKey = "class";
String startingULTgas = "<" + tagType + ">";
String endingULTags = "</" + tagType + ">";
int lengthOfQLIndenet = new String("ql-indent-").length();
HashMap<String, String> startingLiTagMap = new HashMap<String, String>();
HashMap<String, String> lastLiTagMap = new HashMap<String, String>();
Pattern regex = Pattern.compile("ql-indent-\\d");
HashSet<String> hash_Set = new HashSet<String>();
Elements element_Tag = element.select("li");
for (org.jsoup.nodes.Element element2 : element_Tag) {
org.jsoup.nodes.Attributes att = element2.attributes();
if (att.hasKey(attributeKey)) {
String attributeValue = att.get(attributeKey);
Matcher matcher = regex.matcher(attributeValue);
if (matcher.find()) {
if (!startingLiTagMap.containsKey(attributeValue)) {
startingLiTagMap.put(attributeValue, element2.toString());
}
hash_Set.add(matcher.group(0));
if (!startingLiTagMap.get(attributeValue)
.equalsIgnoreCase(element2.toString())) {
lastLiTagMap.put(attributeValue, element2.toString());
}
}
}
}
System.out.println(htmlContent);
Iterator value = hash_Set.iterator();
while (value.hasNext()) {
String liAttributeKey = (String) value.next();
int noOfIndentes = Integer
.parseInt(liAttributeKey.substring(lengthOfQLIndenet));
if (noOfIndentes > 1)
for (int i = 1; i < noOfIndentes; i++) {
startingULTgas = startingULTgas + "<" + tagType + ">";
endingULTags = endingULTags + "</" + tagType + ">";
}
htmlContent = htmlContent.replace(startingLiTagMap.get(liAttributeKey),
startingULTgas + startingLiTagMap.get(liAttributeKey));
if (lastLiTagMap.get(liAttributeKey) != null) {
System.out.println("Inside last Li Map");
htmlContent = htmlContent.replace(lastLiTagMap.get(liAttributeKey),
lastLiTagMap.get(liAttributeKey) + endingULTags);
}
else {
htmlContent = htmlContent.replace(startingLiTagMap.get(liAttributeKey),
startingLiTagMap.get(liAttributeKey) + endingULTags);
}
startingULTgas = "<" + tagType + ">";
endingULTags = "</" + tagType + ">";
}
System.out.println(htmlContent);[enter image description here][1]
return htmlContent;
}

How to generate XPath query matching a specific element in Jsoup?

Hi, this is my web page:
<html>
<head>
</head>
<body>
<div> text div 1</div>
<div>
<span>text of first span </span>
<span>text of second span </span>
</div>
<div> text div 3 </div>
</body>
</html>
I'm using jsoup to parse it, and then I go through all the elements inside the page and get their paths:
// Lists "tag/path = own text" for every element under <body> that has text of its own.
Document doc = Jsoup.parse(new File("C:\\Users\\HC\\Desktop\\dataset\\index.html"), "UTF-8");
ArrayList all = new ArrayList();
for (Element element : doc.body().select("*")) {
    String value = element.ownText();
    if (value.isEmpty()) {
        continue;
    }
    // Start from the element's own tag and prepend one ancestor tag at a time.
    StringBuilder path = new StringBuilder(element.nodeName());
    for (Element ancestor : element.parents()) {
        path.insert(0, ancestor.nodeName() + '/');
    }
    all.add(path + " = " + value + "\n");
    System.out.println(path + " = " + value);
}
return all;
my code give me this result :
html/body/div = text div 1
html/body/div/span = text of first span
html/body/div/span = text of second span
html/body/div = text div 3
in fact i want get result like this :
html/body/div[1] = text div 1
html/body/div[2]/span[1] = text of first span
html/body/div[2]/span[2] = text of second span
html/body/div[3] = text div 3
Could anyone please give me an idea of how to reach this result? Thanks in advance.
As asked, here is an idea.
There are probably better solutions for getting the XPath of a given node, for example using XSLT as in the answer to "Generate/get xpath from XML node java".
Here the possible solution based on your current attempt.
For each (parent) element check if there are more than one element with this name.
Pseudo code: if ( count (el.select('../' + el.nodeName() ) > 1)
If true count the preceding-sibling:: with same name and add 1.
count (el.select('preceding-sibling::' + el.nodeName() ) +1
This is my solution to this problem:
// Builds an absolute XPath for htmlElement of the form /html[1]/body[1]/div[2]/span[1].
StringBuilder absPath = new StringBuilder();
Elements parents = htmlElement.parents();
// Walk the ancestors from the last list entry to the first so the path starts
// at the root; the extra iteration j == -1 appends htmlElement itself.
for (int j = parents.size() - 1; j >= -1; j--) {
    Element element = (j >= 0) ? parents.get(j) : htmlElement;
    // XPath positions are 1-based and count only siblings with the same tag,
    // so count the preceding element siblings that share this tag name.
    int position = 1;
    for (Element prev = element.previousElementSibling();
            prev != null;
            prev = prev.previousElementSibling()) {
        if (prev.tagName().equals(element.tagName())) {
            position++;
        }
    }
    absPath.append('/')
            .append(element.tagName())
            .append('[')
            .append(position)
            .append(']');
}
This would be easier if you traversed the document from the root to the leaves instead of the other way round. This way you can easily group the elements by tag name and handle multiple occurrences accordingly. Here is a recursive approach:
// Path segments from the root down to the element currently being visited.
private final List<String> path = new ArrayList<>();
// Collected "path = text" entries.
private final List<String> all = new ArrayList<>();
/** Returns a read-only view of the collected entries. */
public List<String> getAll() {
return Collections.unmodifiableList(all);
}
/**
 * Clears any previous state and walks the document starting from its child elements.
 *
 * @param doc the parsed document to traverse
 */
public void parse(Document doc) {
path.clear();
all.clear();
parse(doc.children());
}
/**
 * Visits a list of sibling elements, grouped by tag name. A tag that occurs
 * more than once gets a 1-based index in its path segment ({@code div[2]});
 * a tag that occurs once is added without an index.
 *
 * @param elements sibling elements to visit; an empty list is a no-op
 */
private void parse(List<Element> elements) {
    if (elements.isEmpty()) {
        return;
    }
    // A LinkedHashMap keeps the groups in order of first appearance, so the
    // traversal order does not depend on the hash codes of the tag names.
    Map<String, List<Element>> grouped = elements.stream()
            .collect(Collectors.groupingBy(
                    Element::tagName, java.util.LinkedHashMap::new, Collectors.toList()));
    for (Map.Entry<String, List<Element>> entry : grouped.entrySet()) {
        List<Element> sameTag = entry.getValue();
        String tag = entry.getKey();
        if (sameTag.size() > 1) {
            // use paths with index
            int index = 1;
            for (Element e : sameTag) {
                path.add(tag + "[" + (index++) + "]");
                handleElement(e);
                path.remove(path.size() - 1);
            }
        } else {
            // use paths without index
            path.add(tag);
            handleElement(sameTag.get(0));
            path.remove(path.size() - 1);
        }
    }
}
/**
 * Records "path = text" for the element when it has text of its own, then
 * descends into its child elements.
 */
private void handleElement(Element e) {
    String ownText = e.ownText();
    if (!ownText.isEmpty()) {
        // the current path segments, joined with "/"
        all.add(String.join("/", path) + " = " + ownText);
    }
    parse(e.children());
}
Here is the solution in Kotlin. It's correct, and it works. The other answers are wrong and caused me hours of lost work.
/**
 * Builds a positional XPath for this element of the form `/*[1]/*[2]/...`,
 * with one step per ancestor from the root down, followed by the element itself.
 */
fun Element.xpath(): String = buildString {
    val parents = parents()
    for (j in (parents.size - 1) downTo 0) {
        val parent = parents[j]
        append("/*[")
        // `*[n]` counts element nodes only, so use the index among element
        // siblings; siblingIndex() would also count text and doctype nodes.
        append(parent.elementSiblingIndex() + 1)
        append(']')
    }
    append("/*[")
    append(elementSiblingIndex() + 1)
    append(']')
}

Extracting content from html tags using java

I extracted data from an HTML page and then selected the anchor tags, which look like the output below. I have tried different approaches, such as extracting substrings, to get only the title and href attributes, but it's not working. Can anyone help me? This is a small snippet of my output:
my code
// Fetch the page and select every anchor that has an href attribute.
doc = Jsoup.connect("myurl").get();
Elements link = doc.select("a[href]");
String stringLink = null;
for (int i = 0; i < link.size(); i++)
{
// NOTE(review): toString() is called on the whole selection, not on element i,
// so every iteration prints all the matched anchors.
stringLink = link.toString();
System.out.println(stringLink);
}
output
<a class="link" title="Waf Ad" href="https://www.facebook.com/waf.ad.54"
data- jsid="anchor" target="_blank"><img class="_s0 _rw img" src="https:
//fbcdn-profile-a.akamaihd.net/hprofile-ak-ash1/t5/186729_100007938933785_
508764241_q.jpg" alt="Waf Ad" data-jsid="img" /></a>
<a class="link" title="Ana Ga" href="https://www.facebook.com/ata.ga.31392410"
data-jsid="anchor" target="_blank"><img class="_s0 _rw img" src="https://
fbcdn-profile-a.akamaihd.net/hprofile-ak-ash1/t5/186901_100002334679352_
162381693_q.jpg" alt="Ana Ga" data-jsid="img" /></a>
You can use the attr() method of Element class to extract the value of attributes.
For example:
// Read attribute values by attribute name.
String href = link.attr("href");
String title = link.attr("title");
See this page for more: Extract attributes, text, and HTML from elements
To get the page title, you can use
// Fetch the page and read the document title.
Document doc = Jsoup.connect("myurl").get();
String title = doc.title();
For getting the individual links from the different hrefs, you can use this
// Print the href value of every anchor that has one, one per line.
Elements links = doc.select("a[href]");
for (int i = 0; i < links.size(); i++) {
    String href = links.get(i).attr("href");
    System.out.println(href);
}
The attr() method returns the value of the specified attribute on the given tag.
/**
 * Reads a count N followed by N lines from standard input and, for each line,
 * prints the text enclosed by every matching {@code <tag>...</tag>} pair, or
 * {@code None} when the line contains no such pair.
 */
public class Solution {
    /**
     * Group 1 is the tag name, repeated in the closing tag via the
     * back-reference; group 2 is the enclosed text, which may not contain
     * {@code <}. Compiled once because it is reused for every input line.
     */
    private static final Pattern TAG_CONTENT = Pattern.compile("<(.+)>([^<]+)</\\1>");

    /**
     * Entry point.
     *
     * @param args unused
     * @throws NumberFormatException if the first line is not an integer
     */
    public static void main(String[] args) {
        try (Scanner scan = new Scanner(System.in)) {
            int testCases = Integer.parseInt(scan.nextLine().trim());
            // Stop early instead of failing when fewer lines arrive than announced.
            while (testCases-- > 0 && scan.hasNextLine()) {
                String line = scan.nextLine();
                boolean matchFound = false;
                Matcher m = TAG_CONTENT.matcher(line);
                while (m.find()) {
                    System.out.println(m.group(2));
                    matchFound = true;
                }
                if (!matchFound) {
                    System.out.println("None");
                }
            }
        }
    }
}
REGULAR EXPRESSION EXPLANATION:

JSoup storing text in a variable

I'm new to Java / Android development.
I made an app to extract text from an HTML class:
/**
 * Fetches the page, selects the rating elements and copies their text into sgrade.
 * As written it always returns null.
 */
protected List<String> doInBackground(String... url) {
try {
Document doc = Jsoup.connect(
"http://example/test.html").get();
// NOTE(review): the selector has no closing "]", and it targets <a> elements
// while the sample HTML further down uses <p> — confirm the intended selector.
Elements st1 = doc.select("a[class*=subject_rating_details");
for (Element element : st1) {
// NOTE(review): every assignment reads st1.get(0) and the loop variable is unused.
// sgrade is declared elsewhere in this snippet as `new String[] {}` (length 0),
// so these index writes presumably cause the reported force close — confirm.
sgrade[0] = st1.get(0).text();
sgrade[1] = st1.get(0).text();
sgrade[2] = st1.get(0).text();
sgrade[3] = st1.get(0).text();
sgrade[4] = st1.get(0).text();
}
} catch (IOException e) {
e.printStackTrace();
}
// Nothing is collected into the result list; null is returned.
List<String> pinfo = null;
return pinfo;
}
// Dismisses the progress indicator; the pinfo argument is not used.
// NOTE(review): "#Override" looks like a garbled "@Override" — confirm against the original post.
#Override
protected void onPostExecute(List<String> pinfo) {
prog.dismiss();
}
}
List<ListData> varlist = new ArrayList<ListData>();
String sgrade[] = new String[] {};
I used JSoup to extract from my webpage different text from the HTML class="subject_rating_details".
But it force closes with the code above.
I can successfully extract it with a single String, example:
// Each assignment overwrites the previous one, so only the text of index 4 remains.
// The loop variable is unused; the same five assignments run once per matched element.
for (Element element : st1) {
stringname = st1.get(0).text();
stringname = st1.get(1).text();
stringname = st1.get(2).text();
stringname = st1.get(3).text();
stringname = st1.get(4).text();
}
But it only stores the last one ( stringname = st1.get(4).text(); )
I've tried also:
// Stores the text of the first five matches in five separate variables.
// The loop variable is unused; the same five assignments run once per matched element.
for (Element element : st1) {
stringname1 = st1.get(0).text();
stringname2 = st1.get(1).text();
stringname3 = st1.get(2).text();
stringname4 = st1.get(3).text();
stringname5 = st1.get(4).text();
}
But I need the text from st1 in a single variable.
What can I do?
Thanks
EDIT
I want something like this:
// Desired shape: one array slot per matched element.
// NOTE(review): the array is created with length 0 and every slot reads st1.get(0),
// so this sketch neither has room for five entries nor reads five different elements.
String sgrade[] = new String[] {};
for (Element element : st1) {
sgrade[0] = st1.get(0).text();
sgrade[1] = st1.get(0).text();
sgrade[2] = st1.get(0).text();
sgrade[3] = st1.get(0).text();
sgrade[4] = st1.get(0).text();
}
Which I could later read item by item, displaying each text in a TextView:
textview1.setText(sgrade[0]); <--/// This would display "Ford"
textview2.setText(sgrade[1]); <--/// This would display "Mustang"
textview3.setText(sgrade[2]); <--/// This would display "2013"
/// HTML ///
...
<p class="subject_rating_details">Ford</p>
<p class="subject_rating_details">Mustang</p>
<p class="subject_rating_details">2013</p>
...
/// HTML ///
Please try it this way. With this you will get the values of st1 in a single list named stringname.
// Collect the text of every matched element into one list, in selection order.
List<String> stringname = new ArrayList<String>();
for (Element element : st1) {
    // Use the loop variable: one entry per element, however many matched.
    stringname.add(element.text());
}

Categories