'''Script translasi dokumen OpenOffice Writer/Web untuk AdhiHargo.net

Script ini menjelajahi secara rekursif direktori situs, mengekstrak semua teks
dalam tag <BODY></BODY> dari setiap file berekstensi .odt.html ke file
berekstensi .skl.html, untuk kemudian diproses oleh script makepage.py.
Menggunakan modifikasi kelas HTMLProcessor dari buku "Dive Into Python"-nya Mark
Pilgrim (diveintopython.org).

*   Hanya menterjemahkan file-file yang telah berubah setelah terjemahan
    terakhir (jadi seperti make).
*   Menghapus atribut-atribut HTML yang tidak diinginkan.

Author          :   Adhi Hargo
Last modified   :   15/01/2007 3:49:37
'''
# ==============================================================================
#                           HTML Processor
# ==============================================================================

from sgmllib import SGMLParser
import htmlentitydefs, sys, os, stat
from _skel import *

# Attribute names that HTMLProcessor.unknown_starttag omits when it writes a
# start tag back out.
forbidattr = ['width','height']

class HTMLProcessor(SGMLParser):
    '''Pass-through HTML processor based on Mark Pilgrim's "Dive Into Python".

    Everything found between <body> and </body> is re-serialised, piece by
    piece, into ``self.pieces``; anything outside the body is dropped.
    Attributes listed in the module-level ``forbidattr`` are removed from
    every start tag.

    The content of the <h1> element is additionally collected into
    ``self.title`` with runs of whitespace collapsed.  The page is assumed to
    contain a single top-level heading; if there are several, the last one
    that contains text wins.
    '''

    def reset(self):
        '''Reset the parser and all extraction state (called by __init__).'''
        SGMLParser.reset(self)
        self.in_body = False
        self.in_title = False
        self.pieces = []
        self.title = ""
        # Raw fragments of the current <h1>; self.title is derived from them.
        self._title_parts = []

    def _format_attrs(self, attrs):
        '''Serialise (name, value) pairs as ' name="value"', minus forbidattr.

        Double quotes inside a value are written as &quot; so that the value
        cannot terminate the surrounding quotes.  Ampersands are left alone
        on purpose: references the parser did not decode would otherwise be
        escaped twice.
        '''
        return ''.join([' %s="%s"' % (k, v.replace('"', '&quot;'))
                        for (k, v) in attrs if k not in forbidattr])

    def _extend_title(self, fragment):
        '''Append a fragment to the heading and refresh self.title.

        The title is recomputed from all fragments seen so far, so a heading
        delivered in several chunks (e.g. split around an entity reference)
        is kept whole instead of being overwritten by its last chunk.
        '''
        self._title_parts.append(fragment)
        self.title = ' '.join(''.join(self._title_parts).split())

    def unknown_starttag(self, tag, attrs):
        '''Re-emit a start tag without the forbidden attributes.'''
        if not self.in_body: return
        self.pieces.append('<%s%s>' % (tag, self._format_attrs(attrs)))

    def unknown_endtag(self, tag):
        '''Re-emit an end tag.'''
        if not self.in_body: return
        self.pieces.append('</%s>' % tag)

    def handle_charref(self, ref):
        '''Re-emit a numeric character reference unchanged.'''
        if not self.in_body: return
        markup = "&#%s;" % ref
        # NOTE(review): references are kept in their markup form inside the
        # title, assuming the title ends up in HTML output -- confirm.
        if self.in_title: self._extend_title(markup)
        self.pieces.append(markup)

    def handle_comment(self, text):
        '''Re-emit a comment.'''
        if not self.in_body: return
        self.pieces.append("<!--%s-->" % text)

    def handle_data(self, text):
        '''Re-emit character data; inside <h1> it also feeds the title.'''
        if not self.in_body: return
        if self.in_title: self._extend_title(text)
        self.pieces.append(text)

    def handle_decl(self, text):
        '''Re-emit a declaration.'''
        if not self.in_body: return
        self.pieces.append("<!%s>" % text)

    def handle_entityref(self, ref):
        '''Re-emit an entity reference.

        The closing semicolon is only written for entity names known to
        htmlentitydefs; unknown names are emitted as a bare "&name".
        '''
        if not self.in_body: return
        markup = "&%s" % ref
        if ref in htmlentitydefs.entitydefs:
            markup += ";"
        if self.in_title: self._extend_title(markup)
        self.pieces.append(markup)

    def handle_pi(self, text):
        '''Re-emit a processing instruction.'''
        if not self.in_body: return
        self.pieces.append("<?%s>" % text)

    def start_body(self, attrs):
        '''Start copying: the <body> tag itself is not emitted.'''
        self.in_body = True

    def end_body(self):
        '''Stop copying: the </body> tag itself is not emitted.'''
        self.in_body = False

    def start_h1(self, attrs):
        '''Re-emit <h1> and start collecting the page title.

        Simple assumption: there is only one top-level heading.
        '''
        if not self.in_body: return
        self.pieces.append('<h1%s>' % self._format_attrs(attrs))
        # Start a fresh heading; self.title itself is only replaced once this
        # heading actually delivers some content.
        self._title_parts = []
        self.in_title = True

    def end_h1(self):
        '''Re-emit </h1> and stop collecting the page title.'''
        if not self.in_body: return
        self.pieces.append("</h1>")
        self.in_title = False

# ==============================================================================

# Suffix of the source documents that extractpages() looks for.
ODTFILEEXT = '.odt.html'
# Suffix that replaces ODTFILEEXT in the name of the generated file.
DATFILEEXT = '.isi'
# Stream that receives the "unchanged" status messages.
out = sys.stdout

def _is_stale(srcpath, dstpath, force=False):
    '''Return True when dstpath has to be (re)generated from srcpath.

    That is the case when regeneration is forced, when the target does not
    exist yet, or when the source was modified after the target.
    '''
    if force or not os.path.exists(dstpath):
        return True
    return os.stat(srcpath)[stat.ST_MTIME] > os.stat(dstpath)[stat.ST_MTIME]


def extractpages(rootdir, force=False):
    '''Extract the body of every document below rootdir into a sibling file.

    Walks rootdir recursively.  For each file whose name ends in ODTFILEEXT,
    the body is extracted with HTMLProcessor, handed to the Compiler as
    'JUDULHALAMAN' (page title) and 'ISIHALAMAN' (page content), and the
    compiler's output is written next to the source with the suffix
    DATFILEEXT.  Up-to-date targets are skipped with a message on ``out``.

    rootdir -- directory to start walking from.
    force   -- when true, regenerate every target regardless of timestamps.

    Errors from reading, parsing, compiling or writing propagate to the
    caller.
    '''
    parser = HTMLProcessor()
    compiler = Compiler()

    for curdir, dirs, files in os.walk(rootdir):
        for odtstr in files:
            if not odtstr.endswith(ODTFILEEXT):
                continue
            odtpath = os.path.join(curdir, odtstr)
            # Swap only the trailing extension; str.replace() would also
            # rewrite an occurrence in the middle of the file name.
            dstname = odtstr[:-len(ODTFILEEXT)] + DATFILEEXT
            dstpath = os.path.join(curdir, dstname)

            if not _is_stale(odtpath, dstpath, force):
                out.write("%s belum berubah.\n" % odtstr)
                continue

            odtfile = open(odtpath, 'rb')
            try:
                source = odtfile.read()
            finally:
                odtfile.close()

            try:
                parser.pieces.append('<small><a href="%s">Plain version</a></small>' % odtstr)
                parser.feed(source)
                # Flush whatever the parser is still buffering.
                parser.close()
                compiler.feed({'JUDULHALAMAN': parser.title,
                               'ISIHALAMAN': ''.join(parser.pieces)})
                # Open the target only now: a failure above must not leave
                # behind an empty file that looks newer than its source.
                dstfile = open(dstpath, 'wb')
                try:
                    dstfile.writelines(compiler.strings)
                finally:
                    dstfile.close()
            finally:
                # Leave both objects clean for the next document, even when
                # this one failed.
                parser.reset()
                compiler.reset()

if __name__ == '__main__':
    # Root directory: the first command-line argument (made absolute) when
    # given, the current working directory otherwise.
    args = sys.argv[1:]
    if not args:
        rootdir = os.getcwd()
    else:
        rootdir = os.path.abspath(args[0])

    # NOTE(review): regeneration is always forced here, so the timestamp
    # check in extractpages() never skips a file -- confirm this is intended.
    extractpages(rootdir, force=True)