Answer the question
In order to leave comments, you need to log in
How to navigate to internal site links via java.net.Socket?
Hello, the task is to get all the internal links on the site. Initially, I get all the links on the page for the specified domain, then for each received page I do the same until the internal links run out.
import java.io.IOException;
import java.util.Set;
public class Main {
public static void main(String[] args) throws IOException {
String url ="ssau.ru"; //как пример
int port = 80;
LinksParser linksParser = new LinksParser(url,port);
Set<String> set = linksParser.getLinks();
for (String s : set) {
LinksParser lp = new LinksParser(s,port);
set.addAll(lp.getLinks());
}
}
}
import java.io.IOException;
import java.io.PrintWriter;
import java.net.Socket;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class LinksParser {
private String url;
private int port;
public LinksParser(String url){
this.url = url;
this.port = 80;
}
public LinksParser(String url, int port){
this(url);
this.port = port;
}
public Set<String> getLinks() throws IOException {
Set<String> set = new HashSet<>();
Socket socket = new Socket(url,80);
Scanner reader = new Scanner(socket.getInputStream());
PrintWriter writer = new PrintWriter(socket.getOutputStream());
writer.println("GET / HTTP/1.1");
writer.print("Host: ");
writer.println(url);
writer.println("");
writer.flush();
String response ="";
while (reader.hasNext()){
response+=reader.nextLine();
}
System.out.println(response);
Pattern pattern = Pattern.compile("(?<=(?i)href\\s{0,1}=\\s{0,1}\").*?(?=\")");
Matcher matcher = pattern.matcher(response);
while (matcher.find()){
String link = matcher.group();
if(link.contains(".")||link.contains("mailto:")) continue;
if(!link.contains(url)) link = url+link;
if(link.contains("#")){
int index = link.indexOf("#");
link = link.substring(0,index);
}
if(!set.contains(link))set.add(link);
}
writer.close();
reader.close();
socket.close();
return set;
}
}
Exception in thread "main" java.net.UnknownHostException: ssau.ru/education/abitur/
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:184)
at java.net.PlainSocketImpl.connect(PlainSocketImpl.java:172)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:589)
at java.net.Socket.connect(Socket.java:538)
at java.net.Socket.<init>(Socket.java:434)
at java.net.Socket.<init>(Socket.java:211)
at LinksParser.getLinks(LinksParser.java:33)
at Main.main(Main.java:25)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:140)
Answer the question
In order to leave comments, you need to log in
Didn't find what you were looking for?
Ask your questionAsk a Question
731 491 924 answers to any question