@@ -10,6 +10,7 @@ import (
"os/exec"
"path/filepath"
"sync"
+ "time"
)
const (
@@ -23,7 +24,7 @@ var retries *int
func main() {
fname := flag.String("f", "", "CSV containing pages")
- timeout = flag.Int64("timeout", 60, "Timeout on page load, in seconds")
+ timeout = flag.Int64("timeout", 30, "Timeout on page load, in seconds")
nthreads := flag.Int("threads", 50, "Number of concurrent connections")
retries = flag.Int("retries", 5, "Number of times to retry timed-out pages")
help := flag.Bool("h", false, "Print usage information")
@@ -81,8 +82,8 @@ func logger(output <-chan string, waiter
func handler(threadsGroup *sync.WaitGroup, pages chan Page, output chan<- string) {
for page := range pages {
page.Attempts++
- out, ok := fetch(page)
- if !ok {
+ out, retry := fetch(page)
+ if retry {
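+ // Re-queue the page for another attempt unless its retries are exhausted.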
if page.Attempts < *retries {
- page.Attempts++
threadsGroup.Add(1)
@@ -97,10 +98,16 @@
}
}
-func fetch(page Page) (msg string, ok bool) {
+// Response carries the outcome of one page render: the MD5 digest of the
+// screenshot on success, or an error message as the payload when err is true.
+type Response struct {
+ payload []byte
+ err bool
+}
+
+func fetch(page Page) (msg string, retry bool) {
waiter := new(sync.WaitGroup)
- results := make(chan []byte, 2)
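+ // Two buffered slots: each fetchPage sends exactly one Response, so
+ // neither sender blocks and the deferred close below is safe.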
+ results := make(chan Response, 2)
+ defer close(results)
waiter.Add(2)
go fetchPage(waiter, page.OriginURL(), results)
go fetchPage(waiter, page.CopyURL(), results)
@@ -109,42 +116,56 @@ func fetch(page Page) (msg string, ok bo
ores := <-results
cres := <-results
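+ // Receive order is nondeterministic, so ores and cres may be swapped;
+ // the checks below treat both results symmetrically.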
var result string
- ok = false
- if len(ores) == 0 {
- result = "UNABLE TO READ"
- } else if bytes.Compare(ores, cres) == 0 {
+ retry = false
+ if ores.err {
+ result = string(ores.payload)
+ retry = true
+ } else if cres.err {
+ result = string(cres.payload)
+ retry = true
+ } else if bytes.Equal(ores.payload, cres.payload) {
result = "OK"
- ok = true
- } else if len(ores) == 0 {
- result = "TIMEOUT 0"
- } else if cres == nil {
- result = "TIMEOUT 1"
} else {
- result = "FAIL"
+ result = "FAIL COMPARE"
}
msg = fmt.Sprintf("%s,%s,%s,%d,%s", page.OriginURL(), page.CopyURL(), page.AdID, page.Attempts, result)
return
}
-func fetchPage(waiter *sync.WaitGroup, src string, results chan<- []byte) {
- waiter.Done()
+func fetchPage(waiter *sync.WaitGroup, src string, results chan<- Response) {
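+ // Signal the WaitGroup only after this render attempt fully completes.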
+ defer waiter.Done()
h := md5.New()
h.Write([]byte(src))
fname := fmt.Sprintf("autocompare%x.png", h.Sum(nil))
outFileName := filepath.Join(os.TempDir(), fname)
- defer os.Remove(outFileName)
+ // doneChan is buffered so the render goroutine can always deliver its
+ // completion signal and exit, even if the timeout has already fired and
+ // nobody is listening.
+ doneChan := make(chan bool, 1)
cmd := exec.Command("./phantomjs", "render.js", src, outFileName)
- cmd.Run()
- //outBytes, err := cmd.Output()
- //fmt.Printf("%v: \"%s\", %v\n", cmd.Args, string(outBytes), nil)
+ // Render in the background so the select below can race completion
+ // against the timeout.
+ go func() {
+ cmd.Run()
+ doneChan <- true
+ }()
hash := md5.New()
- outFile, err := os.Open(outFileName)
- if err == nil {
- io.Copy(hash, outFile)
- outFile.Close()
+ select {
+ case <-doneChan:
+ outFile, err := os.Open(outFileName)
+ if err != nil {
+ // The render exited without leaving a readable screenshot; report
+ // a retryable failure rather than hashing an empty file, which
+ // would make two failed renders compare as identical.
+ results <- Response{[]byte("UNABLE TO READ"), true}
+ return
+ }
+ io.Copy(hash, outFile)
+ outFile.Close()
+ os.Remove(outFileName)
+ results <- Response{hash.Sum(nil), false}
+ case <-time.After(time.Duration(*timeout) * time.Second):
+ cmd.Process.Kill()
+ results <- Response{[]byte("TIMEOUT"), true}
+ }
- results <- hash.Sum(nil)
}
@@ -1,9 +1,11 @@
package main
import "fmt"
+import "strings"
type Page struct {
Attempts int
+ Retry bool
Subdomain string
Domain1 string
OrigPath string
@@ -11,12 +13,9 @@ type Page struct {
AdID string
}
-func (p Page) Domain() string {
- return fmt.Sprintf("http://%s", p.Domain1)
-}
-
func (p Page) OriginURL() string {
- return fmt.Sprintf("http://%s.paginasamarillas.es%s", p.Domain1, p.OrigPath)
+ newDomain := strings.Replace(p.Domain1, "www", p.Subdomain, 1)
+ return fmt.Sprintf("http://%s%s", newDomain, p.OrigPath)
}
func (p Page) CopyURL() string {