eb6ea966790a — QuintillusCFC 5 years ago
Parse out the structure of the Gopher file.
A => src/main/java/com/ajtjp/gopherarchiver/DocumentLine.java +33 -0
@@ 0,0 1,33 @@ 
+
+package com.ajtjp.gopherarchiver;
+
+/**
+ * One line of a Gopher menu, split into its fields: a one-character item
+ * type, the display text, the selector, and the host and port to request
+ * the selector from.
+ *
+ * @author Andrew
+ */
+public class DocumentLine {
+    /** Port assigned when the port field cannot be parsed as an integer. */
+    private static final int DEFAULT_PORT = 70;
+
+    String type;    //todo: Enum
+    String displayText;
+    String selector;
+    String host;
+    int port;
+
+    /**
+     * Builds a line from the four leading tab-separated fields of a menu line.
+     *
+     * @param typeAndText first field; its first character becomes {@code type}
+     *                    and the remainder becomes {@code displayText}
+     * @param selector    second field, stored unchanged
+     * @param host        third field, stored unchanged
+     * @param port        fourth field; falls back to 70 when it is not a valid integer
+     * @throws IllegalArgumentException if {@code typeAndText} is null or empty,
+     *                                  since there is then no item-type character
+     */
+    public DocumentLine(String typeAndText, String selector, String host, String port) {
+        if (typeAndText == null || typeAndText.isEmpty()) {
+            throw new IllegalArgumentException("Gopher line has no item-type character");
+        }
+        type = typeAndText.substring(0, 1);
+        displayText = typeAndText.substring(1);
+        this.selector = selector;
+        this.host = host;
+        this.port = parsePort(port);
+    }
+
+    /** Used only for sentinel lines that carry a type and nothing else. */
+    private DocumentLine(String type) {
+        this.type = type;
+    }
+
+    /** Parses the port field, returning the default port for null or non-numeric text. */
+    private static int parsePort(String port) {
+        try {
+            //parseInt returns a primitive, avoiding the Integer box valueOf creates.
+            return Integer.parseInt(port);
+        }
+        catch(NumberFormatException ex) {
+            return DEFAULT_PORT;
+        }
+    }
+
+    /** Sentinel whose type is "."; its other fields are left at their defaults (null / 0). */
+    public static final DocumentLine TERMINATOR = new DocumentLine(".");
+}

          
M src/main/java/com/ajtjp/gopherarchiver/Gopher.java +51 -4
@@ 5,7 5,9 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.net.Socket;
-import javax.net.SocketFactory;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
 
 /**
  *

          
@@ 25,12 27,18 @@ public class Gopher {
     }
     
     public Gopher() throws IOException, InterruptedException {
-        downloadPageContents(ReferenceURLs.floodgap);
-        
+        //Downloads the ReferenceURLs.floodgap page as one string, parses it into
+        //DocumentLine entries, and prints the running bytesRead total.
+        //parsedContents is a local list and is not read again in this constructor.
+        //TODO: It probably is a good idea to parse content as we go.
+        //That would be a good way of detecting the terminator; trade-off is it
+        //might make it slightly harder to archive the raw content.
+        //Postponing that decision till later.
+        String pageContents = downloadPageContents(ReferenceURLs.floodgap);
+        List<DocumentLine> parsedContents = new ArrayList<>();
+        parseContents(pageContents, parsedContents);
         System.out.println("Read total of " + bytesRead + " bytes");
     }
     
     private String downloadPageContents(GopherURL url) throws IOException, InterruptedException {
+        StringBuilder sb = new StringBuilder();
         Socket s = new Socket(url.host, url.port);
         OutputStream os = s.getOutputStream();
         os.write((url.selector + "\r\n").getBytes());

          
@@ 50,6 58,7 @@ fetch:  for (;;) {
                 bytesRead+=actual;
                 String valueRead = new String(buffer, "Windows-1252");
                 System.out.println(valueRead);
+                sb.append(valueRead);
             }
 wait:       for (;;) {
                 Thread.sleep(10);

          
@@ 63,6 72,44 @@ wait:       for (;;) {
             }
             break;
         }
-        return "";
+        return sb.toString();
+    }
+    
+    /** Matches the CR LF pair separating lines of a downloaded page; compiled once and reused. */
+    private static final Pattern LINE_BREAK = Pattern.compile("\r\n");
+
+    /**
+     * Splits downloaded page text into lines and appends one DocumentLine per
+     * recognised line to the supplied list.
+     *
+     * A line with four or more tab-separated fields becomes a DocumentLine built
+     * from its first four fields. A line consisting only of "." appends
+     * DocumentLine.TERMINATOR. Every other line is ignored.
+     *
+     * @param pageContents  full text of the page, with lines separated by CR LF
+     * @param documentLines list that parsed lines are appended to, in page order
+     */
+    private void parseContents(String pageContents, List<DocumentLine> documentLines) {
+        for (String line : LINE_BREAK.split(pageContents)) {
+            String[] segments = line.split("\t");
+            if (segments.length >= 4) {
+                try {
+                    DocumentLine parsedLine = new DocumentLine(segments[0], segments[1], segments[2], segments[3]);
+                    documentLines.add(parsedLine);
+                    System.out.println("Line segments");
+                }
+                catch(RuntimeException ex) {
+                    //A malformed line must not abort the whole page, but report
+                    //which line was dropped and why instead of hiding it.
+                    System.err.println("Skipping unparseable line \"" + line + "\": " + ex);
+                }
+            }
+            else if (segments.length == 1 && segments[0].equals(".")) {
+                documentLines.add(DocumentLine.TERMINATOR);
+            }
+        }
+        System.out.println("Lines");
+    }
+    
+    //The server responds with a block of text terminated with a period on a
+    //line by itself, and closes the connection.
+    //TODO: Properly implement detection of the trailing bytes, rather than simply
+    //waiting up to a second for the end.
+    //The five bytes below are the ASCII codes for CR LF '.' CR LF, i.e. a line
+    //break followed by a lone period and another line break.
+    final static byte[] terminatingBytes = new byte[5];
+    
+    static {
+        terminatingBytes[0] = 0x0d;    //CR
+        terminatingBytes[1] = 0x0a;    //LF
+        terminatingBytes[2] = 0x2e;    //'.'
+        terminatingBytes[3] = 0x0d;    //CR
+        terminatingBytes[4] = 0x0a;    //LF
     }
 }