A Java program that scrapes result URLs from a Google search page and saves them to a file.
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetURLs {

    public static void main(String[] args) throws Exception {
        List<String> list = new ArrayList<String>();
        int i = 0;
        do {
            i++;
            // set your query here as you need
            String query = "https://www.google.co.in/webhp?sourceid=chrome-instant&rlz=1C1DFOC_enIN573IN573&ion=1&espv=2&ie=UTF-8#q=javabooks";

            // open the connection with a browser-like User-Agent so Google serves the page
            URLConnection connection = new URL(query).openConnection();
            connection.setRequestProperty("User-Agent", "Chrome/37.0.2062.94");
            connection.connect();

            BufferedReader r = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8")));
            StringBuilder sb = new StringBuilder();
            String line;
            // matches href-style links of the form "http...">
            Pattern trimmer = Pattern.compile("\"http(.*?)\">");
            Matcher m;

            // skip everything up to and including the first line that contains a link
            // (page header); the result links follow
            while ((line = r.readLine()) != null) {
                m = trimmer.matcher(line);
                if (m.find()) {
                    break;
                }
            }

            // collect the link portion of every remaining matching line
            while ((line = r.readLine()) != null) {
                m = trimmer.matcher(line);
                if (m.find()) {
                    if (line.indexOf("http") != -1) {
                        line = line.substring(line.indexOf("http"));
                        line = line.substring(0, line.indexOf("\">"));
                        sb.append(line + "\n");
                    }
                }
            }
            r.close();

            System.out.println(sb.toString());
            list.add(sb.toString());

            // pause between requests so Google does not block us
            try {
                Thread.sleep(5 * 1000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        } while (i <= 3);

        // write the collected links to a file
        File file = new File("/Users/bigdata/Desktop/mytest.txt");
        // if the file doesn't exist, create it
        if (!file.exists()) {
            file.createNewFile();
        }
        FileWriter fw = new FileWriter(file.getAbsoluteFile());
        BufferedWriter bw = new BufferedWriter(fw);
        for (String link : list) {
            bw.write(link + "\n");
        }
        bw.close();
    }
}
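The indexOf/substring juggling in the second read loop can be avoided by using the regex capture group directly. Below is a minimal sketch of that extraction step only; it assumes the same trimmer pattern and reader as above, and the extractLinks helper name is just for illustration, not part of the original program.

    // Sketch: extract links via the capture group instead of indexOf/substring.
    // Intended to live inside the GetURLs class; name and placement are illustrative.
    private static String extractLinks(BufferedReader r, Pattern trimmer) throws IOException {
        StringBuilder sb = new StringBuilder();
        String line;
        while ((line = r.readLine()) != null) {
            Matcher m = trimmer.matcher(line);
            // find() can match several links on one line, so loop over all matches
            while (m.find()) {
                // group(1) holds whatever matched (.*?) between "http and ">
                sb.append("http").append(m.group(1)).append("\n");
            }
        }
        return sb.toString();
    }

Calling this helper in place of the second while loop would also pick up multiple links that share a single line, which the line-by-line substring approach above misses.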