Wednesday, 24 September 2014

SolrJ - Get data from Solr

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;


public class SolrAction
{
    public static void main(String[] args)
    {
        try {
            SolrServer server = new HttpSolrServer("http://localhost:8983/solr");

            // First query: rows=0, we only need the total hit count
            ModifiableSolrParams params = new ModifiableSolrParams();
            params.set("q", "*:*");
            params.set("rows", "0");
            QueryResponse response = server.query(params);
            System.out.println("Response of Action====>" + response);

            int totalResults = (int) response.getResults().getNumFound();
            System.out.println("totalResults======>" + totalResults);

            // Second query: fetch all documents (only sensible for small indexes)
            params = new ModifiableSolrParams();
            params.set("q", "*:*");
            params.set("rows", Integer.toString(totalResults));
            response = server.query(params);

            final SolrDocumentList solrDocumentList = response.getResults();

            for (final SolrDocument doc : solrDocumentList) {
                String title = (String) doc.getFieldValue("title");
                String url = (String) doc.getFieldValue("url");
                String content = (String) doc.getFieldValue("content");
                System.out.println("Title======>" + title + " Content=====>" + content + " Url======>" + url);
            }
        } catch (SolrServerException e) {
            System.out.println("Error..........");
            e.printStackTrace();
        }
    }
}
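SolrJ also offers the SolrQuery helper, which reads a little more cleanly than raw ModifiableSolrParams for the same two-step pattern (count first, then fetch everything). A minimal sketch against the same core URL; the class name here is just for illustration:

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;

public class SolrQueryExample {

    public static void main(String[] args) throws SolrServerException {
        SolrServer server = new HttpSolrServer("http://localhost:8983/solr");

        // First query: rows=0 just to read numFound
        SolrQuery countQuery = new SolrQuery("*:*");
        countQuery.setRows(0);
        long total = server.query(countQuery).getResults().getNumFound();

        // Second query: fetch everything (fine for small indexes only)
        SolrQuery fetchQuery = new SolrQuery("*:*");
        fetchQuery.setRows((int) total);
        QueryResponse response = server.query(fetchQuery);

        for (SolrDocument doc : response.getResults()) {
            System.out.println(doc.getFieldValue("title") + " -> " + doc.getFieldValue("url"));
        }
    }
}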

Thursday, 18 September 2014

Scrape URLs from Google

Program to scrape URLs from Google and save them into a file.

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


public class GetURLs {

    public static void main(String[] args) throws Exception {

        List<String> list = new ArrayList<String>();
        int i = 0;

        do {
            i++;

            // set your query here as you need
            String query = "https://www.google.co.in/webhp?sourceid=chrome-instant&rlz=1C1DFOC_enIN573IN573&ion=1&espv=2&ie=UTF-8#q=javabooks";

            URLConnection connection = new URL(query).openConnection();
            connection.setRequestProperty("User-Agent", "Chrome/37.0.2062.94");
            connection.connect();

            BufferedReader r = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8")));

            StringBuffer sb = new StringBuffer();
            String line;
            Pattern trimmer = Pattern.compile("\"http(.*?)\">");
            Matcher m;

            // skip the page header: read until the first link-like match
            while ((line = r.readLine()) != null) {
                m = trimmer.matcher(line);
                if (m.find()) {
                    break;
                }
            }

            // collect every "http...">  match from the rest of the page
            while ((line = r.readLine()) != null) {
                m = trimmer.matcher(line);
                if (m.find() && line.indexOf("http") != -1) {
                    line = line.substring(line.indexOf("http"));
                    line = line.substring(0, line.indexOf("\">"));
                    sb.append(line + "\n");
                }
            }
            r.close();

            System.out.println(sb.toString());
            list.add(sb.toString());

            // be polite: wait 5 seconds between requests
            try {
                Thread.sleep(5 * 1000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        } while (i <= 3);

        File file = new File("/Users/bigdata/Desktop/mytest.txt");

        // if the file doesn't exist, create it
        if (!file.exists()) {
            file.createNewFile();
        }

        FileWriter fw = new FileWriter(file.getAbsoluteFile());
        BufferedWriter bw = new BufferedWriter(fw);
        for (String link : list) {
            bw.write(link + "\n");
        }
        bw.close();
    }
}
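Note that the query URL above is hard-coded. If you want to plug in an arbitrary search term, it should be URL-encoded before being appended to the search URL. A small sketch using the JDK's URLEncoder; the base search URL and parameter name here are only illustrative:

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

public class BuildQueryUrl {

    public static void main(String[] args) throws UnsupportedEncodingException {
        String term = "java books";                        // raw search term
        String encoded = URLEncoder.encode(term, "UTF-8"); // spaces etc. become + / %xx
        // Hypothetical base URL; adjust to whatever search endpoint you actually hit
        String query = "https://www.google.co.in/search?q=" + encoded;
        System.out.println(query);
    }
}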

Run a terminal command using a Java program


Running a quickscrape command using a Java program:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;


public class RunShellCommandFromJava {

    public static void main(String[] args) {

        String command = "/usr/local/bin/quickscrape --urllist /Users/bigdata/Desktop/test.txt --scraperdir /Users/bigdata/Desktop/scrapers/ --output /Users/bigdata/Desktop/my_test4";

        try {
            long start = System.nanoTime();
            System.out.println("Process start... ");

            Process proc = Runtime.getRuntime().exec(command);

            // print everything quickscrape writes to stdout
            BufferedReader reader =
                new BufferedReader(new InputStreamReader(proc.getInputStream()));
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }

            proc.waitFor();

            long finish = System.nanoTime();
            System.out.println("Process finish... took " + ((finish - start) / 1000000) + " ms");
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
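Runtime.exec splits the command on whitespace and keeps stderr separate from stdout, so a chatty tool can stall if its error stream is never drained. A slightly more defensive sketch of the same call using ProcessBuilder with the streams merged (same example paths as above):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

public class RunQuickscrapeWithProcessBuilder {

    public static void main(String[] args) throws IOException, InterruptedException {
        ProcessBuilder pb = new ProcessBuilder(
                "/usr/local/bin/quickscrape",
                "--urllist", "/Users/bigdata/Desktop/test.txt",
                "--scraperdir", "/Users/bigdata/Desktop/scrapers/",
                "--output", "/Users/bigdata/Desktop/my_test4");
        pb.redirectErrorStream(true);          // merge stderr into stdout

        Process proc = pb.start();
        BufferedReader reader = new BufferedReader(new InputStreamReader(proc.getInputStream()));
        String line;
        while ((line = reader.readLine()) != null) {
            System.out.println(line);
        }
        int exitCode = proc.waitFor();
        System.out.println("quickscrape exited with code " + exitCode);
    }
}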

Content mining


Content mining is the mining, extraction and integration of useful data, information and knowledge from Web page content. Much of the ever-expanding information on the World Wide Web, such as hypertext documents, is heterogeneous and lacks structure, which makes automated discovery, organization and indexing difficult. Search and indexing tools such as Lycos, AltaVista, WebCrawler, ALIWEB [6] and MetaCrawler provide some comfort to users, but they generally do not provide structural information, nor do they categorize, filter or interpret documents. In recent years these factors have prompted researchers to develop more intelligent tools for information retrieval, such as intelligent web agents, and to extend database and data mining techniques to provide a higher level of organization for the semi-structured data available on the web. The agent-based approach to web mining involves the development of sophisticated AI systems that can act autonomously or semi-autonomously on behalf of a particular user to discover and organize web-based information.
Web content mining is usually differentiated from two points of view: the information retrieval view and the database view. Most research from the information retrieval view uses a bag-of-words model, based on statistics about single words in isolation, to represent unstructured text, taking the single words found in the training corpus as features. For semi-structured data, the work relies on the HTML structure inside the documents, and some of it also uses the hyperlink structure between documents for document representation. From the database view, in order to manage and query information on the web better, the mining tries to infer the structure of a web site and transform the site into a database.


Content mining Tool-

Quickscrape

quickscrape is designed to enable large-scale content mining. Scrapers are defined in separate
JSON files that follow a defined structure. This approach has several important benefits:
  • No programming required! Non-programmers can make scrapers using a text editor and a web browser with an element inspector (e.g. Chrome).
  • Large collections of scrapers can be maintained to retrieve similar sets of information from different pages. For example: newspapers or academic journals.
  • Any other software supporting the same format could use the same scraper definitions.
quickscrape is being developed to allow the community early access to the technology that will drive ContentMine, such as ScraperJSON and our Node.js scraping library thresher.

 

Installation

The simplest way to install Node.js on OS X is to go to http://nodejs.org/download/, then download and run the Mac OS X Installer.

Alternatively, if you use the excellent Homebrew package manager, simply run:
brew update
brew install node

Then you can install quickscrape:

sudo npm install --global --unsafe-perm quickscrape
 
 
Run quickscrape --help from the command line to get help:
Usage: quickscrape [options]

Options:

  -h, --help               output usage information
  -V, --version            output the version number
  -u, --url <url>          URL to scrape
  -r, --urllist <path>     path to file with list of URLs to scrape (one per line)
  -s, --scraper <path>     path to scraper definition (in JSON format)
  -d, --scraperdir <path>  path to directory containing scraper definitions (in JSON format)
  -o, --output <path>      where to output results (directory will be created if it doesn't exist)
  -r, --ratelimit <int>    maximum number of scrapes per minute (default 3)
  -l, --loglevel <level>   amount of information to log (silent, verbose, info*, data, warn, error)


Extract data from a single URL with a predefined scraper


First, you'll want to grab some pre-cooked definitions:
git clone https://github.com/ContentMine/journal-scrapers.git
In /usr/local/lib/node_modules/quickscrape/bin/quickscrape.js, replace

var ScraperBox = thresher.scraperbox;

with

var ScraperBox = thresher.ScraperBox;

Now just run quickscrape:
quickscrape \
  --url https://peerj.com/articles/384 \
  --scraper journal-scrapers/scrapers/peerj.json \
  --output peerj-384

Check the output directory:

peerj-384

It contains an https_peerj.com_articles_384 folder. Open this folder:

$ ls

fulltext.xml  rendered.html  results.json

Scraping a list of URLs

 Create a file urls.txt:

http://www.mdpi.com/1420-3049/19/2/2042/htm

Suppose we want to extract basic metadata, the PDF, and all figures with their captions. We can make a simple ScraperJSON scraper to do that, and save it as molecules_figures.json:

{
  "url": "mdpi",
  "elements": {
    "dc.source": {
      "selector": "//meta[@name='dc.source']",
      "attribute": "content"
    },
    "figure_img": {
      "selector": "//div[contains(@id, 'fig')]/div/img",
      "attribute": "src",
      "download": true
    },
    "figure_caption": {
      "selector": "//div[contains(@class, 'html-fig_description')]"
 
    },
    "fulltext_pdf": {
      "selector": "//meta[@name='citation_pdf_url']",
      "attribute": "content",
      "download": true
    },
    "fulltext_html": {
      "selector": "//meta[@name='citation_fulltext_html_url']",
      "attribute": "content",
      "download": true
    },
    "title": {
      "selector": "//meta[@name='citation_title']",
      "attribute": "content"
    },
    "author": {
      "selector": "//meta[@name='citation_author']",
      "attribute": "content"
    },
    "date": {
      "selector": "//meta[@name='citation_date']",
      "attribute": "content"
    },
    "doi": {
      "selector": "//meta[@name='citation_doi']",
      "attribute": "content"
    },
    "volume": {
      "selector": "//meta[@name='citation_volume']",
      "attribute": "content"
    },
    "issue": {
      "selector": "//meta[@name='citation_issue']",
      "attribute": "content"
    },
    "firstpage": {
      "selector": "//meta[@name='citation_firstpage']",
      "attribute": "content"
    },
    "description": {
      "selector": "//meta[@name='description']",
      "attribute": "content"
    }
  }
}
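The selector values in the scraper are XPath expressions. To sanity-check a selector outside quickscrape, you can evaluate it with the JDK's javax.xml.xpath against a locally saved page; a rough sketch, assuming the saved page is well-formed XML (real-world HTML often is not, and then needs an HTML-aware parser instead):

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;

public class CheckSelector {

    public static void main(String[] args) throws Exception {
        // Assumes the article page was saved locally as page.xhtml and parses as XML
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder()
                .parse("page.xhtml");

        XPath xpath = XPathFactory.newInstance().newXPath();
        String title = xpath.evaluate("//meta[@name='citation_title']/@content", doc);
        System.out.println("citation_title = " + title);
    }
}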
Now run:

quickscrape --urllist urls.txt --scraper molecules_figures.json --output my_test

   

Now you are able to scrape data from any URL or list of URLs. quickscrape also has an option to pass a whole scraper directory on the command line, so different types of URLs can be scraped in a single command, like:

quickscrape --urllist urls.txt --scraperdir journal-scrapers/scrapers/ --output my_test

 

Monday, 21 July 2014

Boilerpipe support with Nutch, Solr and HBase



We can integrate Nutch with Boilerpipe through Tika, because the Tika parser plugin ships with the Boilerpipe jar.

The Tika parser lives in the Nutch source tree at /apache-nutch-2.2.1/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

The plugin folder contains the jar at apache-nutch-2.2.1/runtime/local/plugins/parse-tika/boilerpipe-1.1.0.jar
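Independently of Nutch, that same jar can be exercised directly to see what ArticleExtractor keeps for a page. A minimal sketch, assuming boilerpipe-1.1.0 is on the classpath (class and method names are taken from the boilerpipe library, so verify them against your version):

import java.net.URL;
import de.l3s.boilerpipe.extractors.ArticleExtractor;

public class BoilerpipeCheck {

    public static void main(String[] args) throws Exception {
        URL url = new URL("http://venturebeat.com/");
        // ArticleExtractor strips navigation and boilerplate, keeping the main article text
        String text = ArticleExtractor.INSTANCE.getText(url);
        System.out.println(text);
    }
}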

Edit nutch-site.xml

<property>
  <name>tika.boilerpipe</name>
  <value>true</value>
</property>
<property>
  <name>tika.boilerpipe.extractor</name>
  <value>ArticleExtractor</value>
</property>

<property>
  <name>plugin.includes</name>
  <value>parse-tika|protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
  <description>Regular expression naming plugin directory names to
  include. Any plugin not matching this expression is excluded.
  In any case you need at least include the nutch-extensionpoints plugin. By
  default Nutch includes crawling just HTML and plain text via HTTP,
  and basic indexing and search plugins. In order to use HTTPS please enable
  protocol-httpclient, but be aware of possible intermittent problems with the
  underlying commons-httpclient library.
  </description>
</property>

Edit parse-plugins.xml

<mimeType name="text/html">
  <plugin id="parse-tika" />
</mimeType>

<mimeType name="application/xhtml+xml">
  <plugin id="parse-tika" />
</mimeType>

Check a URL using the following command:

$ bin/nutch parsechecker -dumpText [url] > check.log

For example:

$ bin/nutch parsechecker -dumpText http://venturebeat.com/ > check.log

This creates a dump file (check.log) containing the filtered content of the URL.
          



When we crawl and index data using Nutch, Solr and HBase, we can check that our HBase table data is filtered. HBase filtered data -






Sunday, 22 June 2014

Big-Data (Apache-Nutch)

Apache-Nutch-2.2.1 with
Hbase-0.90.4, Solr-4.8.1



Installation on Mac

About Apache Nutch:

Apache Nutch is an open source web crawler written in Java. Using it, we can find web page hyperlinks in an automated manner, reduce maintenance work (for example, checking broken links), and create a copy of all the visited pages for searching over.

Features
    Fetching and parsing are done separately by default; this reduces the risk of an error corrupting the fetch/parse stage of a crawl.
    Plugins have been overhauled as a direct result of the removal of the legacy Lucene dependency for indexing and search.
    The number of plugins for processing various document types shipped with Nutch has been refined. Plain text, XML, OpenDocument (OpenOffice.org), Microsoft Office (Word, Excel, PowerPoint), PDF, RTF and MP3 (ID3 tags) are all now parsed by the Tika plugin. The only parser plugins shipped with Nutch now are Feed (RSS/Atom), HTML, Ext, JavaScript, SWF, Tika & ZIP.
    MapReduce
    Distributed filesystem (via Hadoop)
    Link-graph database
    NTLM authentication

About Apache Solr:

Solr is the popular, blazing fast open source enterprise search platform from the Apache Lucene project. Its major features include powerful full-text search, hit highlighting, faceted search, near real-time indexing, dynamic clustering, database integration, rich document (e.g., Word, PDF) handling, and geospatial search. Solr is highly reliable, scalable and fault tolerant, providing distributed indexing, replication and load-balanced querying, automated failover and recovery, centralized configuration and more. Solr powers the search and navigation features of many of the world's largest internet sites.

Solr is written in Java and runs as a standalone full-text search server within a servlet container such as Jetty. Solr uses the Lucene Java search library at its core for full-text indexing and search, and has REST-like HTTP/XML and JSON APIs that make it easy to use from virtually any programming language. Solr's powerful external configuration allows it to be tailored to almost any type of application without Java coding, and it has an extensive plugin architecture when more advanced customization is required.

Features:
    Advanced Full-Text Search Capabilities
    Optimized for High Volume Web Traffic
    Standards Based Open Interfaces - XML, JSON and HTTP
    Comprehensive HTML Administration Interfaces
    Server statistics exposed over JMX for monitoring
    Linearly scalable, auto index replication, auto failover and recovery
    Near Real-time indexing
    Flexible and Adaptable with XML configuration
    Extensible Plugin Architecture

About Apache HBase:

HBase is an open source, non-relational, distributed database modeled after Google's BigTable and written in Java. It is developed as part of Apache Software Foundation's Apache Hadoop project and runs on top of HDFS (Hadoop Distributed Filesystem), providing BigTable-like capabilities for Hadoop. That is, it provides a fault-tolerant way of storing large quantities of sparse data (small amounts of information caught within a large collection of empty or unimportant data, such as finding the 50 largest items in a group of 2 billion records, or finding the non-zero items representing less than 0.1% of a huge collection). HBase is a type of "NoSQL" database.
Features:
    Linear and modular scalability.
    Strictly consistent reads and writes.
    Automatic and configurable sharding of tables
    Automatic failover support between RegionServers.

    Convenient base classes for backing Hadoop MapReduce jobs with Apache HBase tables.
    Easy to use Java API for client access.
    Block cache and Bloom Filters for real-time queries.
    Query predicate push down via server side Filters
    Thrift gateway and a REST-ful Web service that supports XML, Protobuf, and binary data encoding options
    Extensible jruby-based (JIRB) shell
    Support for exporting metrics via the Hadoop metrics subsystem to files or Ganglia; or via JMX

Installation

Hbase Installation steps:

Download hbase-0.90.4.tar.gz.

Untar the file:
$ tar -vxf hbase-0.90.4.tar.gz

Edit /usr/local/Hbase/conf/hbase-site.xml as below:

<configuration>
  <property>
    <name>hbase.rootdir</name>
    <value>file:///usr/local/hbase</value>
  </property>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>localhost</value>
  </property>
</configuration>

Add JAVA_HOME to /usr/local/Hbase/conf/hbase-env.sh

export JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.7.0_51.jdk/Contents/Home
export HBASE_OPTS="-Djava.security.krb5.realm= -Djava.security.krb5.kdc="

Start Hbase-

$ ./bin/start-hbase.sh

Check that HBase installed correctly:

$ ./bin/hbase shell
HBase Shell; enter 'help<RETURN>' for list of supported commands.
Type "exit<RETURN>" to leave the HBase Shell
Version 0.90.4, r1150278, Sun Jul 24 15:53:29 PDT 2011

Create table

hbase(main):001:0> create 'test', 'cf'
0 row(s) in 0.4340 seconds

Check table

hbase(main):002:0> list 'test'
TABLE                                                                          
test                                                                            
1 row(s) in 0.0580 seconds

Put data on table

hbase(main):003:0> put 'test', 'row1', 'cf:a', 'value1'
0 row(s) in 0.2130 seconds

hbase(main):004:0> put 'test', 'row2', 'cf:b', 'value2'
0 row(s) in 0.0140 seconds

hbase(main):005:0> put 'test', 'row3', 'cf:c', 'value3'
0 row(s) in 0.0130 seconds

Check records of table

hbase(main):006:0> scan 'test'
ROW                   COLUMN+CELL                                              
 row1                 column=cf:a, timestamp=1403154436134, value=value1       
 row2                 column=cf:b, timestamp=1403154448918, value=value2       
 row3                 column=cf:c, timestamp=1403154456718, value=value3       
3 row(s) in 0.0910 seconds


hbase(main):008:0> exit
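The same put/get operations can also be done from Java with the HBase client API bundled with HBase 0.90.x. A minimal sketch, assuming HBase is running locally and the 'test' table created above already exists:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

public class HBaseClientExample {

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create(); // reads hbase-site.xml from the classpath
        HTable table = new HTable(conf, "test");

        // Put a cell: row 'row4', column family 'cf', qualifier 'd'
        Put put = new Put(Bytes.toBytes("row4"));
        put.add(Bytes.toBytes("cf"), Bytes.toBytes("d"), Bytes.toBytes("value4"));
        table.put(put);

        // Read it back
        Result result = table.get(new Get(Bytes.toBytes("row4")));
        System.out.println(Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("d"))));

        table.close();
    }
}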

Stop Hbase-

$ ./bin/stop-hbase.sh




Apache Nutch Installation steps:

Download apache-nutch-2.2.1-src.tar.gz.

Extract the apache-nutch-2.2.1-src.tar.gz file.

Move the extracted folder to /usr/local:

$ mv apache-nutch-2.2.1 /usr/local/

Edit /usr/local/apache-nutch-2.2.1/conf/nutch-site.xml:

<configuration>

    <property>
        <name>storage.data.store.class</name>
        <value>org.apache.gora.hbase.store.HBaseStore</value>
        <description>Default class for storing data</description>
    </property>
    <property>
        <name>http.agent.name</name>
        <value>NutchCrawler</value>
    </property>
    <property>
        <name>http.robots.agents</name>
        <value>NutchCrawler,*</value>
    </property>
</configuration>

Edit /usr/local/apache-nutch-2.2.1/conf/hbase-site.xml:

<configuration>
<property>
<name>hbase.rootdir</name>
<value>file:///usr/local/hbase</value>
</property>
<property>
   <name>hbase.zookeeper.quorum</name>
   <value>localhost</value>
</property>


<property>
   <name>hbase.zookeeper.property.clientPort</name>
   <value>2181</value>
</property>
</configuration>

Edit /usr/local/apache-nutch-2.2.1/conf/gora.properties:

gora.datastore.default=org.apache.gora.hbase.store.HBaseStore

Uncomment the gora-hbase dependency in /usr/local/apache-nutch-2.2.1/ivy/ivy.xml:

<dependency org="org.apache.gora" name="gora-hbase" rev="0.3" conf="*->default" />

Edit /usr/local/apache-nutch-2.2.1/conf/regex-urlfilter.txt and add:

+^http://work-at-google.com

Run ant

$ ant clean
$ ant runtime

This will create a runtime folder inside the apache-nutch-2.2.1 folder:

usr/local/apache-nutch-2.2.1/runtime

Go to /usr/local/apache-nutch-2.2.1/runtime/local and create a urls directory with a seed file:
$ mkdir urls
$ echo "http://work-at-google.com" > urls/seed.txt


Set JAVA_HOME:

$ export JAVA_HOME="$(/usr/libexec/java_home)"

Crawl with Nutch

$ bin/nutch inject urls
$ bin/nutch generate -topN 5
$ bin/nutch fetch -all
$ bin/nutch parse -all
$ bin/nutch updatedb
$ bin/nutch readdb

After running these steps, Nutch creates a 'webpage' table in HBase that stores all the crawled and fetched data.


Apache Solr Installation steps:

Download Solr:

$ brew install solr

Start Solr:

$ cd /usr/local/Cellar/solr/4.8.1/libexec/example/

$ java -jar start.jar

Check that Solr is running:

http://localhost:8983/solr/admin/


Now feed the crawled data into Solr with Nutch:

$ bin/nutch solrindex http://localhost:8983/solr/ -all

Or use the crawl script:

$ bin/crawl urls/seed.txt testCrawl localhost:8983/solr/ 2

After running this command, Nutch creates a 'testCrawl_webpage' table in HBase that stores all the data.

Now we can search over the data in Solr:

http://localhost:8983/solr/#/collection1/query



Apache Nutch 2.x Commands:

$ bin/nutch readdb
(Read/dump crawl db)
Usage: WebTableReader (-stats | -url [url] | -dump <out_dir> [-regex regex])
                      [-crawlId <id>] [-content] [-headers] [-links] [-text]
    -crawlId <id>  - the id to prefix the schemas to operate on,
                     (default: storage.crawl.id)
    -stats [-sort] - print overall statistics to System.out
    [-sort]        - list status sorted by host
    -url <url>     - print information on <url> to System.out
    -dump <out_dir> [-regex regex] - dump the webtable to a text file in
                     <out_dir>
    -content       - dump also raw content
    -headers       - dump protocol headers
    -links         - dump links
    -text          - dump extracted text
    [-regex]       - filter on the URL of the webtable entry

$ bin/nutch inject
(Inject new urls into the database)
Usage: InjectorJob <url_dir> [-crawlId <id>]

$ bin/nutch hostinject
(Inject new urls into the host database)

$ bin/nutch generate
(Generate new segments to fetch from crawldb)
Usage: GeneratorJob [-topN N] [-crawlId id] [-noFilter] [-noNorm] [-adddays numDays]
    -topN <N>      - number of top URLs to be selected, default is Long.MAX_VALUE
    -crawlId <id>  - the id to prefix the schemas to operate on,
                     (default: storage.crawl.id)
    -noFilter      - do not activate the filter plugin to filter the url, default is true
    -noNorm        - do not activate the normalizer plugin to normalize the url, default is true
    -adddays       - Adds numDays to the current time to facilitate crawling urls already fetched sooner then db.default.fetch.interval. Default value is 0.

$ bin/nutch fetch
(Fetch a segment's pages)
Usage: FetcherJob (<batchId> | -all) [-crawlId <id>] [-threads N] [-resume] [-numTasks N]
    <batchId>     - crawl identifier returned by Generator, or -all for all
                    generated batchId-s
    -crawlId <id> - the id to prefix the schemas to operate on,
                    (default: storage.crawl.id)
    -threads N    - number of fetching threads per task
    -resume       - resume interrupted job
    -numTasks N   - if N > 0 then use this many reduce tasks for fetching
                    (default: mapred.map.tasks)

$ bin/nutch parse
(Parse a segment's pages)
Usage: ParserJob (<batchId> | -all) [-crawlId <id>] [-resume] [-force]
    <batchId>     - symbolic batch ID created by Generator
    -crawlId <id> - the id to prefix the schemas to operate on,
                    (default: storage.crawl.id)
    -all          - consider pages from all crawl jobs
    -resume       - resume a previous incomplete job
    -force        - force re-parsing even if a page is already parsed

$ bin/nutch updatedb
(Update crawldb after fetching)

$ bin/nutch updatehostdb
(Update hostdb after fetching)

$ bin/nutch elasticindex
(Run the elastic search indexer on parsed batches)

$ bin/nutch solrindex
(Run the solr indexer on parsed segments and linkdb)
Usage: SolrIndexerJob <solr url> (<batchId> | -all | -reindex) [-crawlId <id>]

$ bin/nutch parsechecker
(Checks the parser for a given url)

$ bin/nutch plugin
(Loads a plugin and runs one of its classes' main())

$ bin/nutch NutchServer
(Run a (local) Nutch server on a user-defined port)
usage: NutchServer [-help] [-log <logging level>] [-port] [-stop <force>]
 -help                  Show this help
 -log <logging level>   Select a logging level for the
                        NutchServer.ALL|CONFIG|FINER|FINEST|INFO|OFF|SEVERE
                        |WARNING
 -port                  Use port for restful API
 -stop <force>          Stop running nutch server. Force stops server
                        despite running jobs

$ bin/nutch junit
(Runs the given JUnit test)

$ bin/nutch CLASSNAME
(Run the class named CLASSNAME)