#!/bin/sh

#
# SUBS
#

#
# parse_options "$@"
# parses the command line flags into the globals all, proxylist,
# pageprefix, verbose and pagewidth.
# sets OPTC to the number of arguments consumed by the flags, so the
# caller can "shift ${OPTC}".
# calls usage() on -h and on any flag getopts rejects.
#
parse_options()
{
	local OPT _proxylist

	# -h is listed in usage(), so it is accepted explicitly instead of
	# being reported by getopts as an illegal option
	while getopts ahp:P:vw: OPT; do
		# escape meta: cut the argument at the first ';', '\' or '$'
		OPTARG=${OPTARG%%[;\\\$]*}

		case "${OPT}" in
		a)	all=yes ;;
		h)	usage ;;
		p)	_proxylist="${OPTARG}"
			# a readable non-directory is taken as a file with
			# proxies, anything else as a comma separated list
			if [ -r "${_proxylist}" ] && [ ! -d "${_proxylist}" ]; then
				proxylist=$(cat -- "${_proxylist}")
			else
				proxylist=$(printf '%s\n' "${_proxylist}" | tr ',' ' ')
			fi
			;;
		P)	pageprefix="${OPTARG}" ;;
		v)	verbose=yes ;;
		w)	pagewidth="${OPTARG}" ;;
		*)	usage ;;
		esac
	done

	OPTC=$((${OPTIND} - 1))
}

#
# err exitval msg ...
# writes "<script name>: msg ..." to stderr and terminates the script
# with exit status exitval.
#
err()
{
	local status=$1

	# drop the status, everything left is the message
	shift
	echo "${0##*/}: $*" 1>&2
	exit ${status}
}

#
# isinteger arg
# returns true if argument is a positive/negative whole integer
# (one optional leading '-' followed by decimal digits only).
# an argument that is empty once the leading '-' is removed is treated
# as a caller error: err 3.
# stolen from bsdinstall
#
isinteger()
{
	local arg="${1#-}"

	# quoted, so whitespace or glob characters in the argument cannot
	# break the test or expand to file names
	[ -z "${arg}" ] && err 3 'isinteger(): bad syntax'

	# removing the shortest suffix that starts with a non-digit leaves
	# the string unchanged only if it is made of digits alone
	[ "${arg}" = "${arg%[!0-9]*}" ]
}

#
# usage
# prints the command synopsis and the flag descriptions to stdout,
# followed by an empty line, then exits with status 1.
#
usage()
{
	# unquoted delimiter: ${0##*/} below expands to the script name
	cat <<EOF
usage: ${0##*/} [-ahPpvw] [numpages] bookid
	-h display this help
	-a all mode (try to get links from all pages, including already downloaded)
	-P pageprefix when numpages specified (*PA, PP, PR, PT)
	-p https://proxy.tld:port,proxy.tld,ip:port | https-proxylist.txt
	-v verbose
	-w pagewidth (800, *1024, 1280, 1440, 1680, ... 2500)

EOF
	exit 1
}

#
# progress page
# shows progress in dots/got_page numbers: prints (without a newline)
# the number itself for a multiple of 10 when lastchar is '.', a dot
# for any other even number, and nothing for an odd number.
# stolen from portsnap
#
progress()
{
	local page=$1

	[ -n "${page}" ] || err 3 'progress(): bad syntax'

	# odd numbers are silent (a multiple of 10 is never odd)
	[ $((${page} % 2)) -eq 0 ] || return 0

	if [ $((${page} % 10)) -eq 0 ] && [ "${lastchar}" = '.' ]; then
		printf '%s' "${page}"
	else
		printf '.'
	fi
}

#
# out $msg $verbose_msg
# in verbose mode (verbose is non-empty) prints verbose_msg on a line
# of its own; otherwise prints the short msg without a newline.
# backslash escapes in msg are interpreted (callers pass 'E\n'), but
# msg is never used as a printf format.
# lastchar holds the last character of the previous short message:
# after '.', 'c' or 'e' the next message is appended directly, after
# anything else it is separated by a space. a '.' is always appended
# directly.
# calling it with both arguments empty is a caller error: err 3.
#
out()
{
	local msg="$1" verbose_msg="$2"

	[ -z "${msg}" ] && [ -z "${verbose_msg}" ] && err 3 'out(): bad syntax'

	if [ -n "${verbose}" ]; then
		# quoted: urls contain '?' and must not be globbed or re-split
		if [ -n "${verbose_msg}" ]; then
			printf '%s\n' "${verbose_msg}"
		fi
	elif [ -n "${msg}" ]; then
		[ "${msg}" = '.' ] && lastchar=.
		# %b keeps the '\n' handling but treats '%' and a leading
		# '-' in msg as plain text
		case "${lastchar}" in
		[.ce])	printf '%b' "${msg}" ;;
		*)	printf ' %b' "${msg}" ;;
		esac
		# remember the last character; the inner expansion is quoted
		# so it is removed literally, not matched as a pattern
		lastchar=${msg#"${msg%?}"}
	fi
}

#
# get_cookie
# requests page PA1 of the book with server headers shown (wget -S)
# and stores the NID cookie found in a Set-Cookie line in the global
# cookie_str.
# returns 1 (after reporting 'E') when no such cookie was found.
#
get_cookie()
{
	local cookie_url="${baseurl}${bookid}&pg=PA1&jscmd=click3"

	# the headers are printed on stderr, the body is thrown away
	unset cookie_str
	cookie_str=$(
		wget ${optcommon} -S -U"${ua}" -O/dev/null \
			--header 'Cookie: CONSENT=YES+' "${cookie_url}" 2>&1 |
		sed -ne '/Set-Cookie:/s/^.*\(NID[^=]*=.*domain=.google.com; HttpOnly\).*$/\1/p'
	)

	if [ -n "${cookie_str}" ]; then
		# show cookie
		out 'c' "cookie: ${cookie_str}"
		return
	fi

	# fail only if cookie has wrong format
	# don't care about non-zero exitcode, redirection from google.com
	# to national googletld can fail, especially if under the proxy
	out 'E\n' "cannot get cookie: ${cookie_str}"
	return 1
}

#
# get_page $page
# requests the viewer data of one page id, extracts every page image
# url found in the answer and downloads the images that are not yet
# present as ${bookid}/<page id>.png.
# reads: bookid all baseurl optcommon ua cookie_str pagewidth
# changes: got_pages (one per request and one per download attempt)
# returns 1 only when a new cookie was due and get_cookie failed;
# a missing argument is a caller error: err 3.
#
get_page()
{
	local url urls page="$1"

	[ -z "${page}" ] && err 3 'get_page(): bad syntax'

	# pull signatures only from missing pages unless in all mode
	if [ -z "${all}" ] && [ -f "${bookid}/${page}.png" ]; then
		return 0
	fi

	# change cookie every 100 pages tried
	# NOTE(review): got_pages also grows inside the download loop below,
	# so it can step over a multiple of 100 and skip this refresh
	if [ $((${got_pages} % 100)) -eq 0 ]; then
		get_cookie || return 1
	fi
	got_pages=$((${got_pages} + 1))

	url="${baseurl}${bookid}&pg=${page}&jscmd=click3"
	out "$(progress "${got_pages}")" "${page}: ${url} TRY"

	# fetch urls
	# NB! signatures tied to cookie and ip
	urls=$(wget ${optcommon} -U"${ua}" --header "Cookie: ${cookie_str}" -O- \
		--header 'Cookie: CONSENT=YES+' \
		"${url}" | tr '}' '\n' | \
		sed -ne 's/^.*"src":"\(https:\/\/[^"]*\)".*$/\1/; /pg=/s/\\u0026/\&/gp')

	# fetch pages
	# (unquoted on purpose: one url per word)
	for url in ${urls}; do
		page=${url##*&pg=}; page=${page%%&*}

		# the page id comes from the server and becomes a file name:
		# refuse empty ids and anything that could leave ${bookid}/
		case "${page}" in
		''|*/*)
			out 'e' "${page}: ${url} ERROR"
			continue
			;;
		esac

		# check again if page already downloaded, we usually get a few
		# urls from a single request
		if [ -f "${bookid}/${page}.png" ]; then
			out '' "${page}: ${url}&w=${pagewidth} ALREADY"
			continue
		fi

		got_pages=$((${got_pages} + 1))

		if wget ${optcommon} -U"${ua}" --header "Cookie: ${cookie_str}" \
			--header 'Cookie: CONSENT=YES+' \
			-O"${bookid}/${page}.png" "${url}&w=${pagewidth}"; then
			out "${page}" "${page}: ${url}&w=${pagewidth} DOWNLOADED"
		else
			# sometime google books returns 404
			# -f: the output file may not even have been created
			rm -f -- "${bookid}/${page}.png"
			out 'e' "${page}: ${url}&w=${pagewidth} ERROR"
		fi
	done

	return 0
}

#
# get_pages
# shuffles and de-duplicates PAGELIST, gets a cookie and then runs
# get_page for every page id in the list.
# returns 1 as soon as get_cookie or get_page fails.
#
get_pages()
{
	# lastchar and got_pages are local here on purpose: the functions
	# called below (get_page, out(), progress()) use and update them
	local lastchar=. got_pages=1 page

	# randomize page requests - google books only shows 200 - 300 urls
	# in one session. also remove duplicates.
	PAGELIST=$(printf '%s\n' ${PAGELIST} | sort -u -R)

	if ! get_cookie; then
		return 1
	fi

	for page in ${PAGELIST}; do
		if ! get_page ${page}; then
			return 1
		fi
	done
}

#
# get_retpages
# returns number of already retrieved pages in retpages:
# the number of entries matching ${bookid}/P*.
#
get_retpages()
{
	# expand the glob straight into the positional parameters, so a
	# bookid with whitespace is not split into several words
	set -- "${bookid}"/P*

	# bookid dir is empty: the pattern is left unexpanded and names
	# nothing that exists
	if [ $# -eq 1 ] && [ ! -e "$1" ] && [ ! -L "$1" ]; then
		retpages=0
	else
		retpages=$#
	fi
}

#
# get_pagelist
# autodetects the available page ids: requests page PA1 of the book,
# collects every "pid"/"Pid" value starting with 'P' and stores the
# sorted, de-duplicated ids plus a few fixed front/back matter ids in
# PAGELIST.
# exits through err 2 when wget fails or no page id was found.
#
get_pagelist()
{
	local _return json url="${baseurl}${bookid}&pg=PA1&jscmd=click3"

	# fetch first and keep wget's own exit status: after a pipeline
	# $? would only report the last command (sort)
	json=$(wget ${optcommon} -U"${ua}" -O- "${url}")
	_return=$?

	# autodetect available pages
	# XXX there can be a few not listed pages
	PAGELIST=$(printf '%s\n' "${json}" | \
		tr '}' '\n' | sed -ne 's/^.*"[Pp]id":"\(P[^"]*\)".*/\1/p' | sort -u)

	# fail if non-zero exitcode returned or PAGELIST is empty
	if [ ${_return} -ne 0 ] || [ -z "${PAGELIST}" ]; then
		err 2 'cannot autodetect available pages, try to set numpages manually'
	fi

	# XXX hack: add PP1-3, PT1-2 to PAGELIST manually
	PAGELIST="${PAGELIST} PP1 PP2 PP3 PT1 PT2"
}

#
# make_pagelist numpages
# create PAGELIST, only for pageprefix: the ids ${pageprefix}1 up to
# ${pageprefix}<numpages>, each one preceded by a space.
# a missing argument is a caller error: err 3.
#
make_pagelist()
{
	local numpages=$1 page=1 list=

	[ -n "${numpages}" ] || err 3 'make_pagelist(): bad syntax'

	# collect the ids first, publish the list once it is complete
	while [ "${page}" -le "${numpages}" ]; do
		list="${list} ${pageprefix}${page}"
		page=$((${page} + 1))
	done

	PAGELIST=${list}
}

#
# MAIN
#

# with wrong UserAgent google returns 401 Unauthorized
# ua='Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Firefox/3.0'
ua='Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)'

baseurl='https://books.google.com/books?id='

# common wget options
# (always expanded unquoted, so that every option becomes its own word)
optcommon='-T5 -t3 -q'

# default page width
pagewidth=1024

# PA - books pages
# PR - preface, contents 	~20
# PP,PT - front, back title	~4
# default page prefix for numpages
pageprefix=PA

# remove possible environment pollution
unset http_proxy https_proxy

# parse the flags, then drop them so that $1 is the first operand
parse_options "$@"
shift "${OPTC}"

isinteger "${pagewidth}" || err 4 "pagewidth must be integer: ${pagewidth}"

# the operands are quoted everywhere below, so whitespace or glob
# characters in them are neither split nor expanded
[ -z "$1" ] && usage

# fallback to old enumeration method if numpages provided
if isinteger "$1"; then
	numpages=$1

	[ -z "$2" ] && usage
	bookid=$2

	make_pagelist "${numpages}"
else
	bookid=$1
	get_pagelist
	# count the words of PAGELIST without touching our own arguments
	numpages=$(set -- ${PAGELIST}; echo $#)
fi

# if bookid dir already exist, continue from previous try
if [ ! -d "${bookid}" ]; then
	mkdir -- "${bookid}" || err 2 "cannot create dir ${bookid}"
fi

get_retpages
echo "pages available/fetched: ${numpages}+/${retpages}"

if [ -z "${proxylist}" ]; then
	get_pages
	echo
else
	# one full pass over the page list per proxy; wget picks the proxy
	# up from the exported https_proxy
	for https_proxy in ${proxylist}; do
		echo "using proxy ${https_proxy}"
		export https_proxy
		get_pages
		echo
	done
fi
