# HG changeset patch
# User QuintillusCFC
# Date 1549502021 18000
#      Wed Feb 06 20:13:41 2019 -0500
# Node ID eb6ea966790a4e027662aa0edd8ba61d0fd02c07
# Parent  a8595a79248455998f9aeeeac1f51ad78aa96296
Parse out the structure of the Gopher file.

diff --git a/src/main/java/com/ajtjp/gopherarchiver/DocumentLine.java b/src/main/java/com/ajtjp/gopherarchiver/DocumentLine.java
new file mode 100644
--- /dev/null
+++ b/src/main/java/com/ajtjp/gopherarchiver/DocumentLine.java
@@ -0,0 +1,60 @@
+
+package com.ajtjp.gopherarchiver;
+
+/**
+ * One line of a Gopher menu: item type, display text, selector, host and port.
+ *
+ * @author Andrew
+ */
+public class DocumentLine {
+    /** Port assumed when a menu line's port field is missing or not a number. */
+    static final int DEFAULT_PORT = 70;
+
+    String type;    //todo: Enum
+    String displayText;
+    String selector;
+    String host;
+    int port;
+
+    /**
+     * Builds a menu line from its tab-separated fields.
+     *
+     * @param typeAndText first field: one item-type character followed by the display text
+     * @param selector second field of the menu line
+     * @param host third field of the menu line
+     * @param port fourth field; {@link #DEFAULT_PORT} is used when it is not an integer
+     * @throws IllegalArgumentException if typeAndText is null or empty, because there is
+     *         no item-type character to read
+     */
+    public DocumentLine(String typeAndText, String selector, String host, String port) {
+        if (typeAndText == null || typeAndText.isEmpty()) {
+            throw new IllegalArgumentException("Menu line has no item type character");
+        }
+        type = typeAndText.substring(0, 1);
+        displayText = typeAndText.substring(1);
+        this.selector = selector;
+        this.host = host;
+        this.port = parsePort(port);
+    }
+
+    /** Used only for the terminator sentinel; all fields except type keep their defaults. */
+    private DocumentLine(String type) {
+        this.type = type;
+    }
+
+    /** Parses the port field, tolerating surrounding whitespace. */
+    private static int parsePort(String port) {
+        if (port == null) {
+            return DEFAULT_PORT;
+        }
+        try {
+            return Integer.parseInt(port.trim());
+        }
+        catch(NumberFormatException ex) {
+            return DEFAULT_PORT;
+        }
+    }
+
+    /** Sentinel for the lone "." line that ends a server response. */
+    public static final DocumentLine TERMINATOR = new DocumentLine(".");
+}
diff --git a/src/main/java/com/ajtjp/gopherarchiver/Gopher.java b/src/main/java/com/ajtjp/gopherarchiver/Gopher.java
--- a/src/main/java/com/ajtjp/gopherarchiver/Gopher.java
+++ b/src/main/java/com/ajtjp/gopherarchiver/Gopher.java
@@ -5,7 +5,9 @@
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.net.Socket;
-import javax.net.SocketFactory;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
 
 /**
  *
@@ -25,12 +27,18 @@
     }
     
     public Gopher() throws IOException, InterruptedException {
-        downloadPageContents(ReferenceURLs.floodgap);
-        
+        //TODO: It probably is a good idea to parse content as we go.
+        //That would be a good way of detecting the terminator; trade-off is it
+        //might make it slightly harder to archive the raw content.
+        //Postponing that decision till later.
+        String pageContents = downloadPageContents(ReferenceURLs.floodgap);
+        List<DocumentLine> parsedContents = new ArrayList<>();
+        parseContents(pageContents, parsedContents);
         System.out.println("Read total of " + bytesRead + " bytes");
     }
     
     private String downloadPageContents(GopherURL url) throws IOException, InterruptedException {
+        StringBuilder sb = new StringBuilder();
         Socket s = new Socket(url.host, url.port);
         OutputStream os = s.getOutputStream();
         os.write((url.selector + "\r\n").getBytes());
@@ -50,6 +58,7 @@
             bytesRead+=actual;
             String valueRead = new String(buffer, "Windows-1252");
             System.out.println(valueRead);
+            sb.append(valueRead);
         }
         wait: for (;;) {
             Thread.sleep(10);
@@ -63,6 +72,44 @@
             }
             break;
         }
-        return "";
+        return sb.toString();
     }
+
+    /** Separator between the lines of a Gopher server response. */
+    private static final Pattern LINE_SEPARATOR = Pattern.compile("\r\n");
+
+    /**
+     * Splits a raw server response into lines and appends one DocumentLine per
+     * recognized line to documentLines. A line with at least four tab-separated
+     * fields becomes a menu entry, a line consisting solely of "." becomes
+     * DocumentLine.TERMINATOR, and every other line is skipped.
+     *
+     * @param pageContents raw response text as returned by downloadPageContents
+     * @param documentLines list that receives the parsed lines, in response order
+     */
+    private void parseContents(String pageContents, List<DocumentLine> documentLines) {
+        for (String line : LINE_SEPARATOR.split(pageContents)) {
+            String[] segments = line.split("\t");
+            if (segments.length >= 4) {
+                try {
+                    documentLines.add(new DocumentLine(segments[0], segments[1], segments[2], segments[3]));
+                    System.out.println("Line segments");
+                }
+                catch(IllegalArgumentException ex) {
+                    //An empty first field has no item type; report the line and keep going.
+                    System.err.println("Skipping malformed line: " + line);
+                }
+            }
+            else if (segments.length == 1 && segments[0].equals(".")) {
+                documentLines.add(DocumentLine.TERMINATOR);
+            }
+        }
+        System.out.println("Lines");
+    }
+
+    //The server responds with a block of text terminated with a period on a
+    //line by itself, and closes the connection.
+    //TODO: Properly implement detection of the trailing bytes, rather than simply
+    //waiting up to a second for the end.
+    final static byte[] terminatingBytes = {0x0d, 0x0a, 0x2e, 0x0d, 0x0a}; //CR LF '.' CR LF
 }