M README.txt +42 -20
@@ 3,33 3,55 @@ This autocompare slowly eats memory, at
mkdir run1
cp page-report.csv run1
time autocompare -threads=8 -retries=3 run1/page-report.csv 2> run1/errs.txt | tee run1/out.csv
- ./scripts/memory | tee run1/memory.txt
+ ./scripts/postprocess run1/page-report.csv
-run1/memory.txt will contain 10-second interval counts of:
- date : system VSZ RSZ .. autocompore VSZ RSZ
-
- ./scripts/filtermissing run1/page-report.csv run1/out.csv
+runX/page-report.csv
+ The input data set from the Kapow bot run, for this run
-run1/page-report.csv.new will contain all of the pages that either have not yet
-been run (not in out.csv), or for which not all of the pages for the site have
-been run.
+runX/page-report.csv.new
+	Pages not yet run, plus pages from sites that were not fully run.
+	This is the input for the next run; see the example at the end of this file.
-run1/out.csv.new will contain all of the pages for sites which have had all of
-their pages run.
+runX/out.csv
+ The results of autocompare, unfiltered.
- ./scripts/mksites < run1/out.csv.new > run1/sites.csv
+runX/out.csv.new
+ The results of autocompare, filtered to remove pages for sites that
+ weren't fully processed. For example, if processing was interrupted,
+ not all of the pages of a site may have been compared; this file
+ contains only pages for sites where all of the site's pages were
+ compared.
+
+runX/sites.csv
+ The results of autocompare, collated to the site level. Collation rules
+ are:
-run1/sites.csv will contain success counts for all sites aggregated from their
-pages.
-
- ./scripts/stats run1
+ * If any page in the site fails, the site fails
+ * The NRMSD for the site is the highest NRMSD for any page in the site
+ * If any page has failing links, the links column will be false
-dumps a list of statistics about the run.
+runX/mem.dat
+ Memory use statistics about the run.
- ./scripts/retrytimeouts run1 >> run1/page-report.csv.new
+runX/memuse.svg.gz
+ A graph of the memory use statistics.
-will re-add all of the timeout failures to the new page report for the next run.
+runX/stats.txt
+ Statistics about the run, including failure rates, summaries, and
+ timings.
- ./scripts/runstats run1
+runX/errs.txt
+ Error log. Contains run statistics at the end, but also information
+ about site timeouts and other errors.
-prints out progress statistics.
+runX/att.csv
+ A list of sites that can be sent to AT&T (OK & links OK)
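+
+To start the next run, feed the filtered page report back in as the new input
+(optionally re-adding timed-out pages first with ./scripts/retrytimeouts).
+A minimal sketch, using example run numbers:
+
+ mkdir run2
+ mv run1/page-report.csv.new run2/page-report.csv
+ time autocompare -threads=8 -retries=3 run2/page-report.csv 2> run2/errs.txt | tee run2/out.csv
+ ./scripts/postprocess run2/page-report.csv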
M autocompare/main.go +4 -1
@@ 55,9 55,10 @@ func main() {
// Memory printer
go func() {
mem := new(runtime.MemStats)
+	KB := uint64(1024) // report memory in KB (memuse.plot labels its y-axis in KB)
for {
runtime.ReadMemStats(mem)
- log.Printf("memory %d %d %d", mem.Alloc, mem.TotalAlloc, mem.Sys)
+	log.Printf("memory %d %d", mem.Alloc / KB, mem.Sys / KB)
time.Sleep(10 * time.Second)
}
}()
@@ 165,12 166,14 @@ func startProducers(urlStream chan Page)
proc1.Kill()
proc1.Release()
_, proc1 = forkPhantom(k * 2)
+			time.Sleep(2 * time.Second) // pause briefly so the restarted phantom can start up
}
if proc2TimedOut {
log.Printf("kill/restart %s\n", uri2)
proc2.Kill()
proc2.Release()
_, proc2 = forkPhantom(k*2 + 1)
+ time.Sleep(2 * time.Second)
}
}
pageStream <- page
M scripts/memuse.plot +15 -6
@@ 1,16 1,25 @@
#!/bin/bash
-gnuplot <<EOF
-set datafile separator " "
+if [[ $1 == "" ]]; then
+ echo "USAGE: $0 <dir>"
+ exit 1
+fi
+BASE=$1
+DAT=$BASE/mem.dat
+
+awk '/memory/ {print $2","$4","$5}' < $BASE/errs.txt > $DAT  # time, Alloc (KB), Sys (KB)
+
+gnuplot <<EOF
+set datafile separator ","
set term svg enhanced size 1200,900
-set output "$1/memuse.svg"
+set output "${BASE}/memuse.svg"
set ylabel "Memory (KB)"
set xlabel "Time"
set xdata time
-set timefmt "%s"
+set timefmt "%H:%M:%S"
set xtics format "%H:%M:%S"
-plot '$1/mem.txt' using 1:6 title "VSZ" with lines, \
- '$1/mem.txt' using 1:7 title "RSZ" with lines
+plot '${DAT}' using 1:2 title "Alloc" with lines, \
+ '${DAT}' using 1:3 title "System" with lines
EOF
gzip $1/memuse.svg
M scripts/postprocess +35 -10
@@ 1,22 1,47 @@
#!/bin/bash
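+# Post-processes one autocompare run. This script runs the filtermissing,
+# mksites and stats steps described below (plus memuse.plot and the final
+# zip); retrytimeouts and runstats are still run by hand as needed.
+#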
+# ./scripts/filtermissing run1/page-report.csv run1/out.csv
+#
+# run1/page-report.csv.new will contain all of the pages that either have not yet
+# been run (not in out.csv), or for which not all of the pages for the site have
+# been run.
+#
+# run1/out.csv.new will contain all of the pages for sites which have had all of
+# their pages run.
+#
+# ./scripts/mksites < run1/out.csv.new > run1/sites.csv
+#
+# run1/sites.csv will contain success counts for all sites aggregated from their
+# pages.
+#
+# ./scripts/stats run1
+#
+# dumps a list of statistics about the run.
+#
+# ./scripts/retrytimeouts run1 >> run1/page-report.csv.new
+#
+# will re-add all of the timeout failures to the new page report for the next run.
+#
+# ./scripts/runstats run1
+#
+# prints out progress statistics.
MYPATH=`dirname $0`
-PREV=$1
-if [[ $# -eq 0 ]]; then
- echo "USAGE: $0 <dir>"
+INPUT=$1
+if [[ $# -ne 1 ]]; then
+ echo "USAGE: $0 <input>"
exit 1
fi
+PREV=`dirname $1`
-${MYPATH}/filtermissing ${PREV}/page-report.csv ${PREV}/out.csv
+${MYPATH}/filtermissing ${INPUT} ${PREV}/out.csv
${MYPATH}/mksites < ${PREV}/out.csv.new > ${PREV}/sites.csv
egrep -e 'OK.*true$' ${PREV}/sites.csv | awk -F, '{print $2","$4}' > ${PREV}/att.csv
-${MYPATH}/stats ${PREV} > ${PREV}/stats.txt
+${MYPATH}/stats ${INPUT} > ${PREV}/stats.txt
${MYPATH}/memuse.plot ${PREV}
cp ${MYPATH}/../run_readme.txt ${PREV}/README.txt
zip -r ${PREV}.zip ${PREV}
-
-#mkdir run$RUNNUM
-#mv run$PREV/page-report.csv.new run${RUNNUM}/page-report.csv
-#echo "time autocompare -threads=8 -retries=3 run${RUNNUM}/page-report.csv 2> run${RUNNUM}/errs.txt | tee run${RUNNUM}/out.csv"
-#echo "./scripts/memory | tee run${RUNNUM}/memory.txt"
M scripts/runstats +10 -5
@@ 15,12 15,17 @@ printf "Processed %d / %d (%g%%)\n" $P $
E=`egrep 'FAIL|false' ${BASE}/out.csv | grep -v MIGRATION | wc -l`
printf "Failures: %d (%g%%)\n" $E `dc -e "2k$E $P/100*p"`
-SS=`head -n1 ${BASE}/mem.txt | awk '{print $1}'`
-ES=`tail -n1 ${BASE}/mem.txt | awk '{print $1}'`
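+# Run time is measured from the first to the last "memory" sample in errs.txt.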
+SD=`grep memory ${BASE}/errs.txt | head -n 1 | awk '{print $1" "$2}'`
+SS=`date -d "$SD" +%s`
+ED=`grep memory ${BASE}/errs.txt | tail -n 1 | awk '{print $1" "$2}'`
+ES=`date -d "$ED" +%s`
SECS=$(($ES - $SS))
pph=`dc -e "2k$P $SECS 3600//p"`
printf "Run time: %02d:%02d:%02d (%g p/h)\n" $(($SECS/3600)) $(($SECS%3600/60)) $(($SECS%60)) $pph
-ETA=`dc -e "4k$T $P-$pph/3600*p"`
-ETA=`dc -e "$ETA 1/p"`
-printf "Remaining time est.: %02d:%02d:%02d\n" $(($ETA/3600)) $(($ETA%3600/60)) $(($ETA%60))
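+# Estimate the remaining time from the pages-per-hour rate, then show it as a wall-clock ETA.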
+TIMEREMAIN=`dc -e "4k$T $P-$pph/3600*p"`
+TIMEREMAIN=`dc -e "$TIMEREMAIN 1/p"`
+ETA=`date -d "$TIMEREMAIN seconds" +%H:%M:%S`
+printf "Remaining time est.: %02d:%02d:%02d (%s)\n" $(($TIMEREMAIN/3600)) $(($TIMEREMAIN%3600/60)) $(($TIMEREMAIN%60)) $ETA
M scripts/stats +25 -15
@@ 1,15 1,22 @@
-#!/bin/sh
+#!/bin/bash
+
+if [[ $# -ne 1 ]]; then
+ echo "USAGE: $0 <input>"
+ exit 1
+fi
+INPUT=$1
+DIR=`dirname $INPUT`
-TOTAL_PAGE=`wc -l < $1/page-report.csv`
-TOTAL_SITE=`awk -F, '{print $3}' < $1/page-report.csv | sort | uniq | wc -l`
-COMP_PAGE=`wc -l <$1/out.csv.new`
-COMP_SITE=`wc -l <$1/sites.csv`
-FAIL_PAGE=`grep FAIL $1/out.csv.new | grep -v MIGRATION | wc -l`
-FAIL_SITE=`grep FAIL $1/sites.csv | grep -v MIGRATION | wc -l`
-AGGREGATED_FAIL_PAGE=`egrep 'FAIL|false$' $1/out.csv.new | grep -v MIGRATION | wc -l`
-AGGREGATED_FAIL_SITE=`egrep 'FAIL|false$' $1/sites.csv | grep -v MIGRATION | wc -l`
-LINK_FAIL_PAGE=`egrep "false$" $1/out.csv.new | wc -l`
-LINK_FAIL_SITE=`egrep "false$" $1/sites.csv | wc -l`
+TOTAL_PAGE=`wc -l < ${INPUT}`
+TOTAL_SITE=`awk -F, '{print $3}' < ${INPUT} | sort | uniq | wc -l`
+COMP_PAGE=`wc -l <${DIR}/out.csv.new`
+COMP_SITE=`wc -l <${DIR}/sites.csv`
+FAIL_PAGE=`grep FAIL ${DIR}/out.csv.new | grep -v MIGRATION | wc -l`
+FAIL_SITE=`grep FAIL ${DIR}/sites.csv | grep -v MIGRATION | wc -l`
+AGGREGATED_FAIL_PAGE=`egrep 'FAIL|false$' ${DIR}/out.csv.new | grep -v MIGRATION | wc -l`
+AGGREGATED_FAIL_SITE=`egrep 'FAIL|false$' ${DIR}/sites.csv | grep -v MIGRATION | wc -l`
+LINK_FAIL_PAGE=`egrep "false$" ${DIR}/out.csv.new | wc -l`
+LINK_FAIL_SITE=`egrep "false$" ${DIR}/sites.csv | wc -l`
COMP_PAGE_P=`dc -e "$COMP_PAGE $TOTAL_PAGE 2k/100*p"`
COMP_SITE_P=`dc -e "$COMP_SITE $TOTAL_SITE 2k/100*p"`
FAIL_SITE_P=`dc -e "$FAIL_SITE $COMP_SITE 2k/100*p"`
@@ 18,7 25,7 @@ LINK_FAIL_SITE_P=`dc -e "$LINK_FAIL_SITE
LINK_FAIL_PAGE_P=`dc -e "$LINK_FAIL_PAGE $COMP_PAGE 2k/100*p"`
AGGREGATED_FAIL_PAGE_P=`dc -e "2k${AGGREGATED_FAIL_PAGE} ${COMP_PAGE}/100*p"`
AGGREGATED_FAIL_SITE_P=`dc -e "2k${AGGREGATED_FAIL_SITE} ${COMP_SITE}/100*p"`
-TIMEOUTS_PAGE=`egrep 'FAIL timeout' $1/out.csv.new | wc -l`
+TIMEOUTS_PAGE=`egrep 'FAIL timeout' ${DIR}/out.csv.new | wc -l`
TIMEOUTS_PAGE_P=`dc -e "2k$TIMEOUTS_PAGE $COMP_PAGE /100*p"`
printf "Total pages : %d\n" $TOTAL_PAGE
@@ 33,8 40,11 @@ printf "Aggregated page failures: %d (%g
printf "Aggregated site failures: %d (%g%%)\n" $AGGREGATED_FAIL_SITE $AGGREGATED_FAIL_SITE_P
printf "Timeouts : %d (%g%%)\n" $TIMEOUTS_PAGE $TIMEOUTS_PAGE_P
-SS=`head -n1 $1/mem.txt | awk '{print $1}'`
-ES=`tail -n1 $1/mem.txt | awk '{print $1}'`
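+# Run time is measured from the first to the last "memory" sample in errs.txt.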
+SD=`grep memory ${DIR}/errs.txt | head -n 1 | awk '{print $1" "$2}'`
+SS=`date -d "$SD" +%s`
+ED=`grep memory ${DIR}/errs.txt | tail -n 1 | awk '{print $1" "$2}'`
+ES=`date -d "$ED" +%s`
SECS=$(($ES - $SS))
-PPH=`dc -e "2k$COMP_PAGE $SECS 3600//p"`
+PPH=`dc -e "2k$COMP_PAGE $SECS 3600//p"`
printf "Run time: %02d:%02d:%02d (%g p/h)\n" $(($SECS/3600)) $(($SECS%3600/60)) $(($SECS%60)) $PPH