Wednesday, 24 September 2014

SolrJ - Get data from Solr

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;


public class SolrAction
{
    public static void main(String[] args)
    {
        try {
            SolrServer server = new HttpSolrServer("http://localhost:8983/solr");

            // First query: rows=0, we only need the total hit count
            ModifiableSolrParams params = new ModifiableSolrParams();
            params.set("q", "*:*");
            params.set("rows", "0");
            QueryResponse response = server.query(params);
            System.out.println("Response of Action====>" + response);

            int totalResults = (int) response.getResults().getNumFound();
            System.out.println("totalResults======>" + totalResults);

            // Second query: fetch all documents (only sensible for small indexes)
            params = new ModifiableSolrParams();
            params.set("q", "*:*");
            params.set("rows", Integer.toString(totalResults));
            response = server.query(params);

            final SolrDocumentList solrDocumentList = response.getResults();

            for (final SolrDocument doc : solrDocumentList) {
                String title = (String) doc.getFieldValue("title");
                String url = (String) doc.getFieldValue("url");
                String content = (String) doc.getFieldValue("content");
                System.out.println("Title======>" + title + " Content=====>" + content + " Url======>" + url);
            }
        } catch (SolrServerException e) {
            System.out.println("Error..........");
            e.printStackTrace();
        }
    }
}
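SolrJ also offers the SolrQuery helper, which reads a little more cleanly than raw ModifiableSolrParams for the same two-step pattern (count first, then fetch everything). A minimal sketch against the same core URL; the class name here is just for illustration:

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;

public class SolrQueryExample {

    public static void main(String[] args) throws SolrServerException {
        SolrServer server = new HttpSolrServer("http://localhost:8983/solr");

        // First query: rows=0 just to read numFound
        SolrQuery countQuery = new SolrQuery("*:*");
        countQuery.setRows(0);
        long total = server.query(countQuery).getResults().getNumFound();

        // Second query: fetch everything (fine for small indexes only)
        SolrQuery fetchQuery = new SolrQuery("*:*");
        fetchQuery.setRows((int) total);
        QueryResponse response = server.query(fetchQuery);

        for (SolrDocument doc : response.getResults()) {
            System.out.println(doc.getFieldValue("title") + " -> " + doc.getFieldValue("url"));
        }
    }
}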

Thursday, 18 September 2014

Scrape URLs from Google

Program to scrape URLs from Google and save them into a file.

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


public class GetURLs {

    public static void main(String[] args) throws Exception {

        List<String> list = new ArrayList<String>();
        int i = 0;

        do {
            i++;

            // set your query here as you need
            String query = "https://www.google.co.in/webhp?sourceid=chrome-instant&rlz=1C1DFOC_enIN573IN573&ion=1&espv=2&ie=UTF-8#q=javabooks";

            URLConnection connection = new URL(query).openConnection();
            connection.setRequestProperty("User-Agent", "Chrome/37.0.2062.94");
            connection.connect();

            BufferedReader r = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8")));

            StringBuffer sb = new StringBuffer();
            String line;
            Pattern trimmer = Pattern.compile("\"http(.*?)\">");
            Matcher m;

            // skip the page header: read until the first link-like match
            while ((line = r.readLine()) != null) {
                m = trimmer.matcher(line);
                if (m.find()) {
                    break;
                }
            }

            // collect every "http...">  match from the rest of the page
            while ((line = r.readLine()) != null) {
                m = trimmer.matcher(line);
                if (m.find() && line.indexOf("http") != -1) {
                    line = line.substring(line.indexOf("http"));
                    line = line.substring(0, line.indexOf("\">"));
                    sb.append(line + "\n");
                }
            }
            r.close();

            System.out.println(sb.toString());
            list.add(sb.toString());

            // be polite: wait 5 seconds between requests
            try {
                Thread.sleep(5 * 1000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        } while (i <= 3);

        File file = new File("/Users/bigdata/Desktop/mytest.txt");

        // if the file doesn't exist, create it
        if (!file.exists()) {
            file.createNewFile();
        }

        FileWriter fw = new FileWriter(file.getAbsoluteFile());
        BufferedWriter bw = new BufferedWriter(fw);
        for (String link : list) {
            bw.write(link + "\n");
        }
        bw.close();
    }
}
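Note that the query URL above is hard-coded. If you want to plug in an arbitrary search term, it should be URL-encoded before being appended to the search URL. A small sketch using the JDK's URLEncoder; the base search URL and parameter name here are only illustrative:

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

public class BuildQueryUrl {

    public static void main(String[] args) throws UnsupportedEncodingException {
        String term = "java books";                        // raw search term
        String encoded = URLEncoder.encode(term, "UTF-8"); // spaces etc. become + / %xx
        // Hypothetical base URL; adjust to whatever search endpoint you actually hit
        String query = "https://www.google.co.in/search?q=" + encoded;
        System.out.println(query);
    }
}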

Run a terminal command using a Java program


Running a quickscrape command using a Java program:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;


public class RunShellCommandFromJava {

    public static void main(String[] args) {

        String command = "/usr/local/bin/quickscrape --urllist /Users/bigdata/Desktop/test.txt --scraperdir /Users/bigdata/Desktop/scrapers/ --output /Users/bigdata/Desktop/my_test4";

        try {
            long start = System.nanoTime();
            System.out.println("Process start... ");

            Process proc = Runtime.getRuntime().exec(command);

            // print everything quickscrape writes to stdout
            BufferedReader reader =
                new BufferedReader(new InputStreamReader(proc.getInputStream()));
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }

            proc.waitFor();

            long finish = System.nanoTime();
            System.out.println("Process finish... took " + ((finish - start) / 1000000) + " ms");
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
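Runtime.exec splits the command on whitespace and keeps stderr separate from stdout, so a chatty tool can stall if its error stream is never drained. A slightly more defensive sketch of the same call using ProcessBuilder with the streams merged (same example paths as above):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

public class RunQuickscrapeWithProcessBuilder {

    public static void main(String[] args) throws IOException, InterruptedException {
        ProcessBuilder pb = new ProcessBuilder(
                "/usr/local/bin/quickscrape",
                "--urllist", "/Users/bigdata/Desktop/test.txt",
                "--scraperdir", "/Users/bigdata/Desktop/scrapers/",
                "--output", "/Users/bigdata/Desktop/my_test4");
        pb.redirectErrorStream(true);          // merge stderr into stdout

        Process proc = pb.start();
        BufferedReader reader = new BufferedReader(new InputStreamReader(proc.getInputStream()));
        String line;
        while ((line = reader.readLine()) != null) {
            System.out.println(line);
        }
        int exitCode = proc.waitFor();
        System.out.println("quickscrape exited with code " + exitCode);
    }
}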

Content mining


Content mining is the mining, extraction and integration of useful data, information and knowledge from Web page content. Much of the ever-expanding information on the World Wide Web, such as hypertext documents, is heterogeneous and lacks structure, which makes automated discovery, organization and indexing difficult. Search and indexing tools such as Lycos, AltaVista, WebCrawler, ALIWEB [6] and MetaCrawler provide some comfort to users, but they generally do not provide structural information, nor do they categorize, filter or interpret documents. In recent years these factors have prompted researchers to develop more intelligent tools for information retrieval, such as intelligent web agents, and to extend database and data mining techniques to provide a higher level of organization for the semi-structured data available on the web. The agent-based approach to web mining involves the development of sophisticated AI systems that can act autonomously or semi-autonomously on behalf of a particular user to discover and organize web-based information.
Web content mining is usually differentiated from two points of view: the information retrieval view and the database view. Most research from the information retrieval view uses a bag-of-words model, based on statistics about single words in isolation, to represent unstructured text, taking the single words found in the training corpus as features. For semi-structured data, the work relies on the HTML structure inside the documents, and some of it also uses the hyperlink structure between documents for document representation. From the database view, in order to manage and query information on the web better, the mining tries to infer the structure of a web site and transform the site into a database.


Content mining Tool-

Quickscrape

quickscrape is designed to enable large-scale content mining. Scrapers are defined in separate
JSON files that follow a defined structure. This approach has several important benefits:
  • No programming required! Non-programmers can make scrapers using a text editor and a web browser with an element inspector (e.g. Chrome).
  • Large collections of scrapers can be maintained to retrieve similar sets of information from different pages. For example: newspapers or academic journals.
  • Any other software supporting the same format could use the same scraper definitions.
quickscrape is being developed to allow the community early access to the technology that will drive ContentMine, such as ScraperJSON and our Node.js scraping library thresher.

 

Installation

The simplest way to install Node.js on OS X is to go to http://nodejs.org/download/, then download and run the Mac OS X Installer.

Alternatively, if you use the excellent Homebrew package manager, simply run:
brew update
brew install node

Then you can install quickscrape:

sudo npm install --global --unsafe-perm quickscrape
 
 
Run quickscrape --help from the command line to get help:
Usage: quickscrape [options]

Options:

  -h, --help               output usage information
  -V, --version            output the version number
  -u, --url <url>          URL to scrape
  -r, --urllist <path>     path to file with list of URLs to scrape (one per line)
  -s, --scraper <path>     path to scraper definition (in JSON format)
  -d, --scraperdir <path>  path to directory containing scraper definitions (in JSON format)
  -o, --output <path>      where to output results (directory will be created if it doesn't exist)
  -r, --ratelimit <int>    maximum number of scrapes per minute (default 3)
  -l, --loglevel <level>   amount of information to log (silent, verbose, info*, data, warn, error)


Extract data from a single URL with a predefined scraper


First, you'll want to grab some pre-cooked definitions:
git clone https://github.com/ContentMine/journal-scrapers.git
In /usr/local/lib/node_modules/quickscrape/bin/quickscrape.js, replace

var ScraperBox = thresher.scraperbox;

with

var ScraperBox = thresher.ScraperBox;

Now just run quickscrape:
quickscrape \
  --url https://peerj.com/articles/384 \
  --scraper journal-scrapers/scrapers/peerj.json \
  --output peerj-384

Check the output directory:

peerj-384

It contains an https_peerj.com_articles_384 folder. Open this folder:

$ ls

fulltext.xml  rendered.html  results.json

Scraping a list of URLs

 Create a file urls.txt:

http://www.mdpi.com/1420-3049/19/2/2042/htm

Suppose we want to extract basic metadata, the PDF, and all figures with their captions. We can make a simple ScraperJSON scraper to do that, and save it as molecules_figures.json:

{
  "url": "mdpi",
  "elements": {
    "dc.source": {
      "selector": "//meta[@name='dc.source']",
      "attribute": "content"
    },
    "figure_img": {
      "selector": "//div[contains(@id, 'fig')]/div/img",
      "attribute": "src",
      "download": true
    },
    "figure_caption": {
      "selector": "//div[contains(@class, 'html-fig_description')]"
 
    },
    "fulltext_pdf": {
      "selector": "//meta[@name='citation_pdf_url']",
      "attribute": "content",
      "download": true
    },
    "fulltext_html": {
      "selector": "//meta[@name='citation_fulltext_html_url']",
      "attribute": "content",
      "download": true
    },
    "title": {
      "selector": "//meta[@name='citation_title']",
      "attribute": "content"
    },
    "author": {
      "selector": "//meta[@name='citation_author']",
      "attribute": "content"
    },
    "date": {
      "selector": "//meta[@name='citation_date']",
      "attribute": "content"
    },
    "doi": {
      "selector": "//meta[@name='citation_doi']",
      "attribute": "content"
    },
    "volume": {
      "selector": "//meta[@name='citation_volume']",
      "attribute": "content"
    },
    "issue": {
      "selector": "//meta[@name='citation_issue']",
      "attribute": "content"
    },
    "firstpage": {
      "selector": "//meta[@name='citation_firstpage']",
      "attribute": "content"
    },
    "description": {
      "selector": "//meta[@name='description']",
      "attribute": "content"
    }
  }
}
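The selector values in the scraper are XPath expressions. To sanity-check a selector outside quickscrape, you can evaluate it with the JDK's javax.xml.xpath against a locally saved page; a rough sketch, assuming the saved page is well-formed XML (real-world HTML often is not, and then needs an HTML-aware parser instead):

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;

public class CheckSelector {

    public static void main(String[] args) throws Exception {
        // Assumes the article page was saved locally as page.xhtml and parses as XML
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder()
                .parse("page.xhtml");

        XPath xpath = XPathFactory.newInstance().newXPath();
        String title = xpath.evaluate("//meta[@name='citation_title']/@content", doc);
        System.out.println("citation_title = " + title);
    }
}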
Now run:

quickscrape --urllist urls.txt --scraper molecules_figures.json --output my_test

   

Now you are able to scrape data from any URL or list of URLs. quickscrape also has an option to pass a whole scraper directory on the command line, so different types of URLs can be scraped in a single command, like:

quickscrape --urllist urls.txt --scraperdir journal-scrapers/scrapers/ --output my_test

 

Monday, 21 July 2014

Boilerpipe support with Nutch, Solr and HBase



We can integrate Nutch with Boilerpipe through Tika, because the Tika parser plugin ships with the Boilerpipe jar.

The Tika parser lives in the Nutch source tree at /apache-nutch-2.2.1/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

The plugin folder contains the jar at apache-nutch-2.2.1/runtime/local/plugins/parse-tika/boilerpipe-1.1.0.jar
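Independently of Nutch, that same jar can be exercised directly to see what ArticleExtractor keeps for a page. A minimal sketch, assuming boilerpipe-1.1.0 is on the classpath (class and method names are taken from the boilerpipe library, so verify them against your version):

import java.net.URL;
import de.l3s.boilerpipe.extractors.ArticleExtractor;

public class BoilerpipeCheck {

    public static void main(String[] args) throws Exception {
        URL url = new URL("http://venturebeat.com/");
        // ArticleExtractor strips navigation and boilerplate, keeping the main article text
        String text = ArticleExtractor.INSTANCE.getText(url);
        System.out.println(text);
    }
}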

Edit nutch-site.xml

<property>
  <name>tika.boilerpipe</name>
  <value>true</value>
</property>
<property>
  <name>tika.boilerpipe.extractor</name>
  <value>ArticleExtractor</value>
</property>

<property>
  <name>plugin.includes</name>
  <value>parse-tika|protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
  <description>Regular expression naming plugin directory names to
  include. Any plugin not matching this expression is excluded.
  In any case you need at least include the nutch-extensionpoints plugin. By
  default Nutch includes crawling just HTML and plain text via HTTP,
  and basic indexing and search plugins. In order to use HTTPS please enable
  protocol-httpclient, but be aware of possible intermittent problems with the
  underlying commons-httpclient library.
  </description>
</property>

Edit parse-plugins.xml

<mimeType name="text/html">
  <plugin id="parse-tika" />
</mimeType>

<mimeType name="application/xhtml+xml">
  <plugin id="parse-tika" />
</mimeType>

Check a URL using the following command:

$ bin/nutch parsechecker -dumpText [url] > check.log

For example:

$ bin/nutch parsechecker -dumpText http://venturebeat.com/ > check.log

This creates a dump file (check.log) containing the filtered content of the URL.
          



When we crawl and index data using Nutch, Solr and HBase, we can check that our HBase table data is filtered. HBase filtered data -






Sunday, 22 June 2014

Big-Data (Apache-Nutch)

Apache-Nutch-2.2.1 with
Hbase-0.90.4, Solr-4.8.1



Installation on Mac

About Apache Nutch:

Apache Nutch is an open source web crawler written in Java. Using it, we can find web page hyperlinks in an automated manner, reduce maintenance work (for example, checking broken links), and create a copy of all the visited pages for searching over.

Features
    Fetching and parsing are done separately by default; this reduces the risk of an error corrupting the fetch/parse stage of a crawl.
    Plugins have been overhauled as a direct result of the removal of the legacy Lucene dependency for indexing and search.
    The number of plugins for processing various document types shipped with Nutch has been refined. Plain text, XML, OpenDocument (OpenOffice.org), Microsoft Office (Word, Excel, PowerPoint), PDF, RTF and MP3 (ID3 tags) are all now parsed by the Tika plugin. The only parser plugins shipped with Nutch now are Feed (RSS/Atom), HTML, Ext, JavaScript, SWF, Tika & ZIP.
    MapReduce
    Distributed filesystem (via Hadoop)
    Link-graph database
    NTLM authentication

About Apache Solr:

Solr is the popular, blazing fast open source enterprise search platform from the Apache Lucene project. Its major features include powerful full-text search, hit highlighting, faceted search, near real-time indexing, dynamic clustering, database integration, rich document (e.g., Word, PDF) handling, and geospatial search. Solr is highly reliable, scalable and fault tolerant, providing distributed indexing, replication and load-balanced querying, automated failover and recovery, centralized configuration and more. Solr powers the search and navigation features of many of the world's largest internet sites.

Solr is written in Java and runs as a standalone full-text search server within a servlet container such as Jetty. Solr uses the Lucene Java search library at its core for full-text indexing and search, and has REST-like HTTP/XML and JSON APIs that make it easy to use from virtually any programming language. Solr's powerful external configuration allows it to be tailored to almost any type of application without Java coding, and it has an extensive plugin architecture when more advanced customization is required.

Features:
    Advanced Full-Text Search Capabilities
    Optimized for High Volume Web Traffic
    Standards Based Open Interfaces - XML, JSON and HTTP
    Comprehensive HTML Administration Interfaces
    Server statistics exposed over JMX for monitoring
    Linearly scalable, auto index replication, auto failover and recovery
    Near Real-time indexing
    Flexible and Adaptable with XML configuration
    Extensible Plugin Architecture

About Apache HBase:

HBase is an open source, non-relational, distributed database modeled after Google's BigTable and written in Java. It is developed as part of Apache Software Foundation's Apache Hadoop project and runs on top of HDFS (Hadoop Distributed Filesystem), providing BigTable-like capabilities for Hadoop. That is, it provides a fault-tolerant way of storing large quantities of sparse data (small amounts of information caught within a large collection of empty or unimportant data, such as finding the 50 largest items in a group of 2 billion records, or finding the non-zero items representing less than 0.1% of a huge collection). HBase is a type of "NoSQL" database.
Features:
    Linear and modular scalability.
    Strictly consistent reads and writes.
    Automatic and configurable sharding of tables
    Automatic failover support between RegionServers.

    Convenient base classes for backing Hadoop MapReduce jobs with Apache HBase tables.
    Easy to use Java API for client access.
    Block cache and Bloom Filters for real-time queries.
    Query predicate push down via server side Filters
    Thrift gateway and a REST-ful Web service that supports XML, Protobuf, and binary data encoding options
    Extensible jruby-based (JIRB) shell
    Support for exporting metrics via the Hadoop metrics subsystem to files or Ganglia; or via JMX

Installation

Hbase Installation steps:

Download hbase-0.90.4.tar.gz.

Untar the file:
$ tar -vxf hbase-0.90.4.tar.gz

Edit /usr/local/Hbase/conf/hbase-site.xml as below:

<configuration>
  <property>
    <name>hbase.rootdir</name>
    <value>file:///usr/local/hbase</value>
  </property>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>localhost</value>
  </property>
</configuration>

Add JAVA_HOME to /usr/local/Hbase/conf/hbase-env.sh

export JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.7.0_51.jdk/Contents/Home
export HBASE_OPTS="-Djava.security.krb5.realm= -Djava.security.krb5.kdc="

Start Hbase-

$ ./bin/start-hbase.sh

Check that HBase installed correctly:

$ ./bin/hbase shell
HBase Shell; enter 'help<RETURN>' for list of supported commands.
Type "exit<RETURN>" to leave the HBase Shell
Version 0.90.4, r1150278, Sun Jul 24 15:53:29 PDT 2011

Create table

hbase(main):001:0> create 'test', 'cf'
0 row(s) in 0.4340 seconds

Check table

hbase(main):002:0> list 'test'
TABLE                                                                          
test                                                                            
1 row(s) in 0.0580 seconds

Put data on table

hbase(main):003:0> put 'test', 'row1', 'cf:a', 'value1'
0 row(s) in 0.2130 seconds

hbase(main):004:0> put 'test', 'row2', 'cf:b', 'value2'
0 row(s) in 0.0140 seconds

hbase(main):005:0> put 'test', 'row3', 'cf:c', 'value3'
0 row(s) in 0.0130 seconds

Check records of table

hbase(main):006:0> scan 'test'
ROW                   COLUMN+CELL                                              
 row1                 column=cf:a, timestamp=1403154436134, value=value1       
 row2                 column=cf:b, timestamp=1403154448918, value=value2       
 row3                 column=cf:c, timestamp=1403154456718, value=value3       
3 row(s) in 0.0910 seconds


hbase(main):008:0> exit
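The same put/get operations can also be done from Java with the HBase client API bundled with HBase 0.90.x. A minimal sketch, assuming HBase is running locally and the 'test' table created above already exists:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

public class HBaseClientExample {

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create(); // reads hbase-site.xml from the classpath
        HTable table = new HTable(conf, "test");

        // Put a cell: row 'row4', column family 'cf', qualifier 'd'
        Put put = new Put(Bytes.toBytes("row4"));
        put.add(Bytes.toBytes("cf"), Bytes.toBytes("d"), Bytes.toBytes("value4"));
        table.put(put);

        // Read it back
        Result result = table.get(new Get(Bytes.toBytes("row4")));
        System.out.println(Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("d"))));

        table.close();
    }
}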

Stop Hbase-

$ ./bin/stop-hbase.sh




Apache Nutch Installation steps:

Download apache-nutch-2.2.1-src.tar.gz.

Extract the apache-nutch-2.2.1-src.tar.gz file.

Move the extracted folder to /usr/local:

$ mv apache-nutch-2.2.1 /usr/local/

Edit /usr/local/apache-nutch-2.2.1/conf/nutch-site.xml:

<configuration>

    <property>
        <name>storage.data.store.class</name>
        <value>org.apache.gora.hbase.store.HBaseStore</value>
        <description>Default class for storing data</description>
    </property>
    <property>
        <name>http.agent.name</name>
        <value>NutchCrawler</value>
    </property>
    <property>
        <name>http.robots.agents</name>
        <value>NutchCrawler,*</value>
    </property>
</configuration>

Edit /usr/local/apache-nutch-2.2.1/conf/hbase-site.xml:

<configuration>
<property>
<name>hbase.rootdir</name>
<value>file:///usr/local/hbase</value>
</property>
<property>
   <name>hbase.zookeeper.quorum</name>
   <value>localhost</value>
</property>


<property>
   <name>hbase.zookeeper.property.clientPort</name>
   <value>2181</value>
</property>
</configuration>

Edit /usr/local/apache-nutch-2.2.1/conf/gora.properties:

gora.datastore.default=org.apache.gora.hbase.store.HBaseStore

Uncomment the gora-hbase dependency in /usr/local/apache-nutch-2.2.1/ivy/ivy.xml:

<dependency org="org.apache.gora" name="gora-hbase" rev="0.3" conf="*->default" />

Edit /usr/local/apache-nutch-2.2.1/conf/regex-urlfilter.txt and add:

+^http://work-at-google.com

Run ant

$ ant clean
$ ant runtime

This will create a runtime folder inside the apache-nutch-2.2.1 folder:

usr/local/apache-nutch-2.2.1/runtime

Go to /usr/local/apache-nutch-2.2.1/runtime/local and create a urls directory with a seed file:
$ mkdir urls
$ echo "http://work-at-google.com" > urls/seed.txt


Set JAVA_HOME:

$ export JAVA_HOME="$(/usr/libexec/java_home)"

Crawl with Nutch

$ bin/nutch inject urls
$ bin/nutch generate -topN 5
$ bin/nutch fetch -all
$ bin/nutch parse -all
$ bin/nutch updatedb
$ bin/nutch readdb

After running these steps, Nutch creates a 'webpage' table in HBase that stores all the crawled and fetched data.


Apache Solr Installation steps:

Download Solr:

$ brew install solr

Start Solr:

$ cd /usr/local/Cellar/solr/4.8.1/libexec/example/

$ java -jar start.jar

Check that Solr is running:

http://localhost:8983/solr/admin/


Now feed the crawled data into Solr with Nutch:

$ bin/nutch solrindex http://localhost:8983/solr/ -all

Or use the crawl script:

$ bin/crawl urls/seed.txt testCrawl localhost:8983/solr/ 2

After running this command, Nutch creates a 'testCrawl_webpage' table in HBase that stores all the data.

Now we can search over the data in Solr:

http://localhost:8983/solr/#/collection1/query



Apache Nutch 2.x Commands:

$ bin/nutch readdb
(Read/dump crawl db)
Usage: WebTableReader (-stats | -url [url] | -dump <out_dir> [-regex regex])
                      [-crawlId <id>] [-content] [-headers] [-links] [-text]
    -crawlId <id>  - the id to prefix the schemas to operate on,
                     (default: storage.crawl.id)
    -stats [-sort] - print overall statistics to System.out
    [-sort]        - list status sorted by host
    -url <url>     - print information on <url> to System.out
    -dump <out_dir> [-regex regex] - dump the webtable to a text file in
                     <out_dir>
    -content       - dump also raw content
    -headers       - dump protocol headers
    -links         - dump links
    -text          - dump extracted text
    [-regex]       - filter on the URL of the webtable entry

$ bin/nutch inject
(Inject new urls into the database)
Usage: InjectorJob <url_dir> [-crawlId <id>]

$ bin/nutch hostinject
(Inject new urls into the host database)

$ bin/nutch generate
(Generate new segments to fetch from crawldb)
Usage: GeneratorJob [-topN N] [-crawlId id] [-noFilter] [-noNorm] [-adddays numDays]
    -topN <N>      - number of top URLs to be selected, default is Long.MAX_VALUE
    -crawlId <id>  - the id to prefix the schemas to operate on,
                     (default: storage.crawl.id)
    -noFilter      - do not activate the filter plugin to filter the url, default is true
    -noNorm        - do not activate the normalizer plugin to normalize the url, default is true
    -adddays       - Adds numDays to the current time to facilitate crawling urls already fetched sooner then db.default.fetch.interval. Default value is 0.

$ bin/nutch fetch
(Fetch a segment's pages)
Usage: FetcherJob (<batchId> | -all) [-crawlId <id>] [-threads N] [-resume] [-numTasks N]
    <batchId>     - crawl identifier returned by Generator, or -all for all
                    generated batchId-s
    -crawlId <id> - the id to prefix the schemas to operate on,
                    (default: storage.crawl.id)
    -threads N    - number of fetching threads per task
    -resume       - resume interrupted job
    -numTasks N   - if N > 0 then use this many reduce tasks for fetching
                    (default: mapred.map.tasks)

$ bin/nutch parse
(Parse a segment's pages)
Usage: ParserJob (<batchId> | -all) [-crawlId <id>] [-resume] [-force]
    <batchId>     - symbolic batch ID created by Generator
    -crawlId <id> - the id to prefix the schemas to operate on,
                    (default: storage.crawl.id)
    -all          - consider pages from all crawl jobs
    -resume       - resume a previous incomplete job
    -force        - force re-parsing even if a page is already parsed

$ bin/nutch updatedb
(Update crawldb after fetching)

$ bin/nutch updatehostdb
(Update hostdb after fetching)

$ bin/nutch elasticindex
(Run the elastic search indexer on parsed batches)

$ bin/nutch solrindex
(Run the solr indexer on parsed segments and linkdb)
Usage: SolrIndexerJob <solr url> (<batchId> | -all | -reindex) [-crawlId <id>]

$ bin/nutch parsechecker
(Checks the parser for a given url)

$ bin/nutch plugin
(Loads a plugin and runs one of its classes' main())

$ bin/nutch NutchServer
(Run a (local) Nutch server on a user-defined port)
usage: NutchServer [-help] [-log <logging level>] [-port] [-stop <force>]
 -help                  Show this help
 -log <logging level>   Select a logging level for the
                        NutchServer.ALL|CONFIG|FINER|FINEST|INFO|OFF|SEVERE
                        |WARNING
 -port                  Use port for restful API
 -stop <force>          Stop running nutch server. Force stops server
                        despite running jobs

$ bin/nutch junit
(Runs the given JUnit test)

$ bin/nutch CLASSNAME
(Run the class named CLASSNAME)