June 02, 2010

How to extract the biggest text block from an HTML page?

One of the interesting problems in handling HTML content is auto-detecting the biggest HTML block in the center of the page. This can be very useful for on-the-fly content analysis done in the browser. Here is an example of how it could be done by walking the DOM after the page is rendered.

 

// Royans K Tharakan (2010 June)
// http://www.royans.net/
// You are free to use this in any form as long as you give credit where it's due.
// I would appreciate it if you submitted your changes/improvements back to me or to some other public forum.
// Requires jquery

// Shared scan state, read and written by getSize() and read by getLargestDiv().

// Size score of the best candidate recorded so far; -1 until one is recorded.
var largestSize = -1;
// Element currently recorded as the best candidate; null until one is recorded.
var largestDiv = null;
// Counter incremented every time a new best candidate is recorded; its value
// is used as the suffix of that element's "d_id" attribute ("tmp_<n>").
var largestId = 0;

/**
 * Scans the document body with getSize() and returns a selector string for
 * the element that scan recorded as the largest candidate.
 *
 * The scan always runs (so the global scan state and the "d_id" attributes
 * are updated), even when the fixed Wikipedia selector ends up being returned.
 *
 * @returns {string} "#bodyContent" when "wikipedia.org" occurs in the page
 *     URL at an index greater than 0, otherwise an attribute selector of the
 *     form "[d_id='tmp_<largestId>']".
 */
function getLargestDiv() {
    var body = document.getElementsByTagName("body")[0];
    // Called for its side effects only; the returned size is not needed here.
    getSize(body, 0);

    var onWikipedia = window.location.href.indexOf("wikipedia.org") > 0;
    return onWikipedia ? "#bodyContent" : "[d_id='tmp_" + largestId + "']";
}

/**
 * Recursively measures a DOM node and, as a side effect, records the
 * DIV/SPAN/OL/LI/P/A element with the largest "own" size seen so far.
 *
 * Two numbers are tracked per node:
 *  - total size: innerHTML length, plus tagName.length * 2 + 5 when the node
 *    has a tagName, plus name length + value length + 4 per attribute
 *    (presumably approximating the node's serialized markup length);
 *  - own size: innerHTML length minus what each child accounts for.
 *
 * Side effects: when a candidate element beats the current best (or none is
 * recorded yet) the globals largestDiv, largestSize and largestId are updated
 * and the element receives a "d_id" attribute of "tmp_<largestId>".
 * Attributes written for earlier winners are not removed.
 *
 * NOTE(review): for a child with non-empty innerHTML the tag overhead is
 * subtracted in addition to the child's returned size, which already
 * includes that overhead. This looks like double counting, but it is part of
 * the existing heuristic and is kept as is; confirm before changing.
 *
 * @param {Node} currentElement node to measure.
 * @param {number} depth recursion depth; only forwarded (incremented) to the
 *     recursive calls, not otherwise used.
 * @returns {number} total size minus own size for SPAN/OL/LI/P/A elements
 *     and for DIV elements without child nodes; the total size otherwise.
 */
function getSize(currentElement, depth) {
    var tag = currentElement.tagName;

    // Start from the raw innerHTML length; nodes without innerHTML count as 0.
    var ownSize = currentElement.innerHTML ? currentElement.innerHTML.length : 0;
    var totalSize = tag ? ownSize + tag.length * 2 + 5 : ownSize;

    var attrs = currentElement.attributes;
    if (attrs != null) {
        for (var a = 0; a < attrs.length; a++) {
            totalSize += attrs[a].name.length + attrs[a].value.length + 4;
        }
    }

    // Remove from the own size whatever each child accounts for.
    var children = currentElement.childNodes;
    if (children) {
        for (var c = 0; children[c]; c++) {
            var child = children[c];
            var childSize = getSize(child, depth + 1);
            var deduction = childSize;
            if (child.innerHTML) {
                deduction += child.tagName.length * 2 + 5;
            }
            ownSize -= deduction;
        }
    }

    var isDiv = tag == 'DIV';
    var isTextTag = tag == 'SPAN' || tag == 'OL' || tag == 'LI'
            || tag == 'P' || tag == 'A';

    // Strictly-greater comparison: on a tie the earlier winner is kept.
    var beatsCurrent = largestDiv == null || ownSize > largestSize;
    if (beatsCurrent && (isDiv || isTextTag)) {
        largestDiv = currentElement;
        largestSize = ownSize;
        largestId++;
        currentElement.setAttribute("d_id", "tmp_" + largestId);
    }

    // For these elements the own size is left out of the reported size, so
    // the caller's subtraction leaves it in the parent's own size.
    if (isTextTag || (isDiv && currentElement.childNodes.length == 0)) {
        return totalSize - ownSize;
    }

    return totalSize;
}

No comments: