How to extract the biggest text block from an HTML page?
One of the interesting problems in handling HTML content is auto-detecting the biggest block of text in the main body of a page. This can be very useful for on-the-fly content analysis done in the browser. Here is an example of how it can be done by walking the DOM after the page has been rendered.
// Royans K Tharakan (2010 June)
// http://www.royans.net/
// You are free to use this in any form as long as you give credit where it's due.
// I would appreciate it if you submitted your changes/improvements back to me or to some other public forum.
// Requires jquery
// Shared scan state: written by getSize() during the DOM walk, read by getLargestDiv().

// Best candidate element recorded so far; null until the first candidate is seen.
var largestDiv = null;
// Size score of largestDiv; starts at -1, below the score of any empty element (0).
var largestSize = -1;
// Running counter used to build the "tmp_<n>" value of the d_id marker attribute;
// it is incremented every time a new best candidate is recorded.
var largestId = 0;
/**
 * Returns a selector string for the page's main text block.
 *
 * On URLs containing "wikipedia.org" the fixed selector "#bodyContent" is
 * returned without scanning the page. Otherwise the body is walked with
 * getSize(), which tags the best candidate with a d_id="tmp_<n>" attribute,
 * and an attribute selector for the most recent tag is returned.
 *
 * @returns {string} "#bodyContent" or "[d_id='tmp_<n>']". If no candidate has
 *     ever been tagged, <n> is 0 and the selector matches nothing.
 */
function getLargestDiv() {
    // Decide the shortcut first: the walk below mutates the DOM (d_id
    // attributes) and the shared scan state, so skip it when unused.
    if (window.location.href.indexOf("wikipedia.org") > 0) {
        return "#bodyContent";
    }

    // The body can be missing if this runs before the document is parsed;
    // in that case there is nothing to walk.
    var body = document.body;
    if (body) {
        getSize(body, 0);
    }

    return "[d_id='tmp_" + largestId + "']";
}
/**
 * Recursively estimates the markup size of a DOM node and, as a side effect,
 * records the element with the most "own" text seen so far.
 *
 * Two sizes are tracked per node:
 *  - actualsize: estimated serialized size of the node, i.e. innerHTML length
 *    plus "<tag>" + "</tag>" (tagName.length * 2 + 5) plus, per attribute,
 *    name + value + 4 (space, '=', two quotes).
 *  - basesize: innerHTML length minus whatever each child reports, i.e. the
 *    text this node owns directly.
 *
 * SPAN/OL/LI/P/A (and childless DIVs) report only their markup overhead to the
 * parent, so their text is credited to the parent as well; every other node
 * reports its full size. Nodes without innerHTML/tagName (e.g. text nodes)
 * report 0, leaving their text in the parent's basesize.
 *
 * When a DIV/SPAN/OL/LI/P/A beats the current best basesize, the shared
 * largestDiv/largestSize/largestId state is updated and the element is tagged
 * with d_id="tmp_<largestId>".
 *
 * Tag names are compared case-insensitively, and a child's tag overhead is
 * subtracted from its parent exactly once (it is already part of the value
 * the child returns).
 *
 * @param {Node} currentElement node to measure.
 * @param {number} depth recursion depth; passed along but otherwise unused.
 * @returns {number} the size this node reports to its parent.
 */
function getSize(currentElement, depth) {
    // Tags that may be recorded as the largest block.
    var CANDIDATE_TAGS = { DIV: true, SPAN: true, OL: true, LI: true, P: true, A: true };
    // Tags whose own text is also credited to the parent.
    var TEXT_CARRIER_TAGS = { SPAN: true, OL: true, LI: true, P: true, A: true };

    // tagName keeps its source case in XML/XHTML documents; normalise it.
    var tag = currentElement.tagName ? String(currentElement.tagName).toUpperCase() : "";

    // Read innerHTML before recursing so d_id attributes added to descendants
    // during this walk do not inflate the measurement.
    var basesize = currentElement.innerHTML ? currentElement.innerHTML.length : 0;

    var actualsize = basesize;
    if (tag) {
        actualsize += tag.length * 2 + 5;
    }

    var attributes = currentElement.attributes;
    if (attributes != null) {
        for (var j = 0; j < attributes.length; j++) {
            actualsize += attributes[j].name.length + attributes[j].value.length + 4;
        }
    }

    var children = currentElement.childNodes;
    var childCount = children ? children.length : 0;
    for (var i = 0; i < childCount; i++) {
        // The child's reported size already contains its tag overhead, so it
        // must not be subtracted a second time here.
        basesize -= getSize(children[i], depth + 1);
    }

    if ((largestDiv == null || basesize > largestSize) && CANDIDATE_TAGS[tag] === true) {
        largestDiv = currentElement;
        largestSize = basesize;
        largestId++;
        currentElement.setAttribute("d_id", "tmp_" + largestId);
    }

    var isChildlessDiv = tag === "DIV" && childCount === 0;
    if (TEXT_CARRIER_TAGS[tag] === true || isChildlessDiv) {
        // Report only the markup overhead; the text stays with the parent.
        return actualsize - basesize;
    }
    return actualsize;
}
Comments