Projects

I don't have a ton of extra time to spend on projects lately, but if I do build anything I'll be sure to put it up here.




Redline Routes

Link

This is a website I've created for user-submitted ideal motorcycle routes. It started as just messing around with the Google Maps API, then turned into a full site because I couldn't find a decent one that covered Canadian roads, only US roads.




Bulk HTML Tidy and XHTML 1.0 Strict compliance

bulkTidy.py - Click for source code

#! /usr/bin/python.exe

# Usage: from command line, run following command - dirWalker.py /path/to/scan/
# The path to scan is optional.  If no path is given, current working directory will be used.
# A path to the batch file must be given.  This creates a batch tidy script for all htm files in the directory/directories given
# This will automatically run the batch script and some regular expressions to clean the documents.

from __future__ import generators # needed for Python 2.2
from datetime import datetime # Not really necessary, but used for script timer (for benchmarking)
import sys, os, stat, re, thread

# Running count of .htm files that cleanDocs() failed to open for read/write
# (it increments this on IOError); the script prints it in the final summary.
cantOpen = 0    #global variable


def walktree(top = ".", depthfirst = True):
    """Generate (directory, entry_names) pairs for top and every directory below it.

    Credit to Noah Spurrier and Doug Fort.

    top        -- directory to start from (defaults to the current directory).
    depthfirst -- when true, a directory is yielded after all of its
                  subdirectories; when false, it is yielded before them.

    Entries that cannot be lstat'ed are skipped.  Because lstat is used,
    symbolic links are never descended into.
    """
    import os
    import stat

    entries = os.listdir(top)

    # Pre-order: report this directory before descending.
    if not depthfirst:
        yield top, entries

    for entry in entries:
        child = os.path.join(top, entry)
        try:
            mode = os.lstat(child).st_mode
        except os.error:
            continue
        if not stat.S_ISDIR(mode):
            continue
        for pair in walktree(child, depthfirst):
            yield pair

    # Post-order: report this directory once its subtree is exhausted.
    if depthfirst:
        yield top, entries


#/////////////////////////////////////////////////
# Start A Real Validator creation
#/////////////////////////////////////////////////

def findArvFiles(top, depthfirst=False):
    """Return the A Real Validator file list: one Windows-style path per line.

    Walks top with walktree(), skips any folder whose path contains '_baks',
    '_vti', '_notes' or '_cnf', and lists every entry ending in '.htm' or
    '.HTM'.  The returned string starts with a blank line.

    top        -- folder to scan.
    depthfirst -- accepted for signature compatibility; it is not forwarded,
                  so walktree() always runs with its own default order.

    Path conversion: a leading "/cygdrive/" is stripped and the slash after
    the drive letter becomes ":\\" ("/cygdrive/c/www" -> "c:\\www").  Paths
    without that prefix (for example the default ".") only have their slashes
    flipped; previously their first slash was also turned into ":\\", which
    produced paths such as ".:\\sub".
    """
    cygPrefix = "/cygdrive/"
    ret = ['\n']

    for folder, names in walktree(top):
        if (folder.find('_baks') >= 0 or folder.find('_vti') >= 0
                or folder.find('_notes') >= 0 or folder.find('_cnf') >= 0):
            continue

        # Put into Windows file format so the list works outside Cygwin.
        if folder.startswith(cygPrefix):
            windowsDrivePath = folder[len(cygPrefix):].replace("/", ":\\", 1)
        else:
            windowsDrivePath = folder
        windowsDrivePath = windowsDrivePath.replace("/", "\\")
        if not windowsDrivePath.endswith('\\'):
            windowsDrivePath = windowsDrivePath + "\\"

        for name in names:
            if name.endswith('.htm') or name.endswith('.HTM'):
                ret.append(windowsDrivePath + name + "\n")
    return ''.join(ret) # Much faster than += method

def makeArvFile(top, depthfirst=False):
    """Write the A Real Validator file list to /cygdrive/c/toClean.arv.

    top        -- folder to scan; passed to findArvFiles().
    depthfirst -- passed through to findArvFiles().

    Exits via sys.exit() when more than two command-line arguments were given.
    The script documents the scan path as optional and __main__ falls back to
    '.', so a bare invocation is accepted here; it used to be rejected, which
    silently skipped the ARV file because this function runs in a worker
    thread.

    After writing, the file is chmod'ed to rwx for user, group and other.
    """
    if len(sys.argv) > 3:  # the program name plus at most two arguments
        # stop the program and print an error message
        sys.exit("Invalid number of options.\nEx: dirWalker.py /path/to/scan/(optional) /path/to/arv/file/\nIf no path to scan folder is specified, current path is used")
    filePath = '/cygdrive/c/'
    arvPath = filePath + 'toClean.arv'

    # Build the listing first so a failed walk does not leave a truncated file.
    listing = findArvFiles(top, depthfirst)

    openFile = open(arvPath, 'w+')
    try:
        openFile.write(listing)
    finally:
        # Close the handle even if the write fails.
        openFile.close()

    os.chmod(arvPath, stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU)
    print("File permissions on ARV file changed successfully\n")
    print("A Real Validator file successfully created at " + arvPath + "\n")

#/////////////////////////////////////////////////
# End A Real Validator creation
#/////////////////////////////////////////////////



#/////////////////////////////////////////////////
# Start HTML file creation (for QA purposes)
#/////////////////////////////////////////////////

def findHtmlFiles(top, depthfirst=False):
    """Return HTML anchors, one per .htm file found, for the QA page.

    Walks top with walktree(), skips any folder whose path contains '_baks',
    '_vti', '_notes' or '_cnf', and emits one '<a href=...>...</a><br />' line
    for every entry ending in '.htm' or '.HTM'.  The returned string starts
    with a blank line.

    Paths under /cygdrive/z/ are rewritten to http://clf2dev/ and paths under
    /cygdrive/e/ to http://preweb/; other paths are linked as they are.

    top        -- folder to scan.
    depthfirst -- accepted for signature compatibility; it is not forwarded,
                  so walktree() always runs with its own default order.

    The link target and link text are now XML-escaped, so a file name
    containing '&', '<', '>' or '"' no longer produces malformed markup.
    """
    from xml.sax.saxutils import escape # To quote out things like &amp;
    ret = ['\n']

    for folder, names in walktree(top):
        if (folder.find('_baks') >= 0 or folder.find('_vti') >= 0
                or folder.find('_notes') >= 0 or folder.find('_cnf') >= 0):
            continue

        if folder.endswith('/'):
            base = folder
        else:
            base = folder + "/"

        for name in names:
            if not (name.endswith('.htm') or name.endswith('.HTM')):
                continue

            html = base + name
            if html.find("/cygdrive/z/") >= 0:
                html = html.replace("/cygdrive/z/","http://clf2dev/")
            elif html.find("/cygdrive/e/") >= 0:
                html = html.replace("/cygdrive/e/","http://preweb/")

            # The attribute value also needs its double quotes escaped.
            href = escape(html, {'"': '&quot;'})
            ret.append("<a href=\"" + href + "\">" + escape(html) + "</a><br />\n")
    return ''.join(ret) # Much faster than += method

def makeHtmlFile(top, depthfirst=False):
    """Write the QA link page to /cygdrive/c/toQA.html.

    top        -- folder to scan; passed to findHtmlFiles().
    depthfirst -- passed through to findHtmlFiles().

    Exits via sys.exit() when more than two command-line arguments were given.
    The script documents the scan path as optional and __main__ falls back to
    '.', so a bare invocation is accepted here; it used to be rejected, which
    silently skipped the QA page because this function runs in a worker
    thread.

    After writing, the file is chmod'ed to rwx for user, group and other.
    """
    if len(sys.argv) > 3:  # the program name plus at most two arguments
        # stop the program and print an error message
        sys.exit("Invalid number of options.\nEx: dirWalker.py /path/to/scan/(optional) /path/to/arv/file/\nIf no path to scan folder is specified, current path is used")
    filePath = '/cygdrive/c/'
    htmlPath = filePath + 'toQA.html'

    # Build the markup first so a failed walk does not leave a truncated file.
    markup = findHtmlFiles(top, depthfirst)

    openFile = open(htmlPath, 'w+')
    try:
        openFile.write(markup)
    finally:
        # Close the handle even if the write fails.
        openFile.close()

    os.chmod(htmlPath, stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU)
    print("File permissions on HTML file changed successfully\n")
    print("HTML QA file successfully created at " + htmlPath + "\n")

#/////////////////////////////////////////////////
# End HTML file creation 
#/////////////////////////////////////////////////


#/////////////////////////////////////////////////
# Start Tidy batch file creation
#/////////////////////////////////////////////////

def searchTidyBatch(top, depthfirst=False):
    """Return the text of a Windows batch file that runs HTML Tidy on every .htm file.

    Walks top with walktree(), skips any folder whose path contains '_baks',
    '_vti', '_notes' or '_cnf', and emits one tidy command line (file path in
    double quotes) for every entry ending in '.htm' or '.HTM'.  The first line
    of the result is a '::' batch comment.

    top        -- folder to scan.
    depthfirst -- accepted for signature compatibility; it is not forwarded,
                  so walktree() always runs with its own default order.

    Path conversion: a leading "/cygdrive/" is stripped and the slash after
    the drive letter becomes ":\\" ("/cygdrive/c/www" -> "c:\\www").  Paths
    without that prefix (for example the default ".") only have their slashes
    flipped; previously their first slash was also turned into ":\\", which
    put paths such as ".:\\sub" in the batch file.
    """
    tidyString = """tidy -i -w -q -m -asxhtml --quote-marks yes --drop-empty-paras yes --indent yes --indent-spaces 3 --wrap 0 --doctype strict --char-encoding latin1 --logical-emphasis yes --drop-font-tags yes --join-classes no --merge-divs no --word-2000 yes """
    cygPrefix = "/cygdrive/"
    ret = [':: Tidy batch file creation command line tool, written by Andrew Hinde for the CRTC\n']

    for folder, names in walktree(top):
        if (folder.find('_baks') >= 0 or folder.find('_vti') >= 0
                or folder.find('_notes') >= 0 or folder.find('_cnf') >= 0):
            continue

        # Put into Windows file format for running batch file by double clicking in Windows Explorer
        if folder.startswith(cygPrefix):
            windowsDrivePath = folder[len(cygPrefix):].replace("/", ":\\", 1)
        else:
            windowsDrivePath = folder
        windowsDrivePath = windowsDrivePath.replace("/", "\\")
        if not windowsDrivePath.endswith('\\'):
            windowsDrivePath = windowsDrivePath + "\\"

        for name in names:
            if name.endswith('.htm') or name.endswith('.HTM'):
                ret.append(tidyString + "\"" + windowsDrivePath + name + "\"\n")
    return ''.join(ret) # Much faster than += method

def writeTidyBatch(top, depthfirst=False):
    """Write the Tidy batch script to /cygdrive/c/toTidy.bat.

    top        -- folder to scan; passed to searchTidyBatch().
    depthfirst -- passed through to searchTidyBatch().

    Exits via sys.exit() when more than one command-line argument was given.
    Progress messages are printed as each step completes.  After writing, the
    file is chmod'ed to rwx for user, group and other.
    """
    if len(sys.argv) < 1 or len(sys.argv) > 2:  # the program name and the one arguments
        # stop the program and print an error message
        sys.exit("Invalid number of options.\nEx: dirWalker.py /path/to/scan/(optional) \nIf no path to scan folder is specified, current path is used")
    filePath = '/cygdrive/c/'
    batchPath = filePath + 'toTidy.bat'

    openFile = open(batchPath, 'w+')
    try:
        print("File created successfully : " + batchPath + "\n")
        openFile.write(searchTidyBatch(top, depthfirst))
        print("File written successfully\n")
    finally:
        # Close the handle even if the walk or the write fails.
        openFile.close()

    os.chmod(batchPath, stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU)
    print("File permissions changed successfully\n")
    print("Tidy batch file successfully created.\n")

#/////////////////////////////////////////////////
# End Tidy batch file creation
#/////////////////////////////////////////////////


#/////////////////////////////////////////////////
# Start extra cleaning of documents
#/////////////////////////////////////////////////

def _applyExtraCleanup(fileContents):
    """Return fileContents after the post-Tidy regex and literal clean-ups.

    The order of the steps is significant (width/height are stripped first and
    then re-added for tphp.gif), so it must not be rearranged.
    """
    removals = (
        r"(?i)(width|height)=\"[\d]+\%?\"",                        # width="" and height="" attributes
        r"(?i)\<\!(--)?\[(end)?if[^\>]+>",                           # MS if/end if statements
        r"(?i)<[\/]?u>",                                             # Underline tags
        r"(?i)(bgcolor|bordercolor)=\"\#[a-fA-F0-9]{3,6}\"",        # Bordercolor and bgcolor attributes
        r"(?i) start=\"[^\"]+\"",                                   # Ordered list "start" attributes
        r"(?i) language=(\")?javascript(\")?",                      # Language="Javascript" on <script> tags
        r"(?i)<[\/]?[a-zA-Z0-9]+:[^>]+>",                            # MS tags with a colon in the tag name
    )
    for pattern in removals:
        fileContents = re.sub(pattern, "", fileContents)

    # Non-regex search and replaces
    fileContents = fileContents.replace("src=\"/clf20/images/tphp.gif\" alt=\"\"","src=\"/clf20/images/tphp.gif\" width=\"19\" height=\"12\" alt=\"\"")
    fileContents = fileContents.replace(" style=\"margin-left: 2em\""," class=\"indent1\"")

    # Inline style attributes are deliberately left alone: stripping them
    # (r"(?i) style=\"[^\"]+\"") changed how documents looked, so that step
    # was taken out of the script.

    # For all ../ before the clf20 folder (CSS, images, etc)
    fileContents = re.sub(r"(?i)(\.\.\/)+clf20\/", "/clf20/", fileContents)

    # For all XML declarations at the beginning of the page
    XML = "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>"
    return fileContents.replace(XML, "")


def cleanDocs(top, depthfirst=False):
    """Rewrite every .htm file under top in place with the extra clean-ups applied.

    Walks top with walktree(), skips any folder whose path contains '_baks',
    '_vti', '_notes' or '_cnf', and for every entry ending in '.htm' or '.HTM'
    reads the file, runs it through _applyExtraCleanup() and writes it back.

    top        -- folder to scan.
    depthfirst -- accepted for signature compatibility; it is not forwarded,
                  so walktree() always runs with its own default order.

    A file that cannot be opened for read/write is reported and skipped; on
    IOError the module-level cantOpen counter is incremented.

    Path conversion: a leading "/cygdrive/" is stripped and the slash after
    the drive letter becomes ":\\".  Paths without that prefix (for example
    the default ".") only have their slashes flipped; previously their first
    slash was also turned into ":\\", so files in subfolders of a relative
    scan path could not be opened.
    """
    global cantOpen
    cygPrefix = "/cygdrive/"

    for folder, names in walktree(top):
        if (folder.find('_baks') >= 0 or folder.find('_vti') >= 0
                or folder.find('_notes') >= 0 or folder.find('_cnf') >= 0):
            continue

        # Put into Windows file format, matching the paths in the Tidy batch file.
        if folder.startswith(cygPrefix):
            windowsDrivePath = folder[len(cygPrefix):].replace("/", ":\\", 1)
        else:
            windowsDrivePath = folder
        windowsDrivePath = windowsDrivePath.replace("/", "\\")
        if not windowsDrivePath.endswith('\\'):
            windowsDrivePath = windowsDrivePath + "\\"

        for name in names:
            if not (name.endswith('.htm') or name.endswith('.HTM')):
                continue

            target = windowsDrivePath + name

            # Opening in "r+" doubles as a writability check before any work is done.
            try:
                ioFile = open(target, "r+")
            except IOError:
                # Usually the case if the file is set at Read Only
                # Future upgrade: chmod the file to allow for writing
                print("Cannot open file!")
                cantOpen += 1
                continue
            except OSError:
                print("File not found!")
                continue

            try:
                fileContents = ioFile.read()
            finally:
                ioFile.close()

            fileContents = _applyExtraCleanup(fileContents)

            updatedFile = open(target, "w+")
            try:
                updatedFile.write(fileContents)
            finally:
                updatedFile.close()
            print("Finished extra cleanup on " + name)

#/////////////////////////////////////////////////
# End extra cleaning of documents
#/////////////////////////////////////////////////



def runTidyBatch():
    """Run the generated Tidy batch file through os.system()."""
    batchCommand = "/cygdrive/c/toTidy.bat"
    os.system(batchCommand)
    
def fileCount(top, depthfirst=False):
    """Return how many entries under top end in '.htm' or '.HTM'.

    Folders whose path contains '_baks', '_vti', '_notes' or '_cnf' are not
    counted.  depthfirst is accepted but not used by the count.
    """
    excludedMarkers = ('_baks', '_vti', '_notes', '_cnf')
    total = 0
    for folder, entries in walktree(top):
        skipFolder = False
        for marker in excludedMarkers:
            if folder.find(marker) >= 0:
                skipFolder = True
                break
        if skipFolder:
            continue
        for entry in entries:
            # A 4-character tail equal to the suffix is the same test as endswith().
            if entry[-4:] in ('.htm', '.HTM'):
                total = total + 1
    return total

if __name__ == '__main__':
    import threading

    # The folder to scan is the first command-line argument, but only when
    # exactly one argument is given; otherwise the current directory is scanned.
    if len(sys.argv) == 2:
        top = sys.argv[1]
    else:
        top = '.'
    startTime = datetime.now()

    # Build the ARV list and the QA page in the background while Tidy runs.
    # (The earlier fileCount thread was dropped: its result was never used and
    # the count is recomputed for the summary below.)
    workers = [
        threading.Thread(target=makeArvFile, args=(top,)),
        threading.Thread(target=makeHtmlFile, args=(top,)),
    ]
    for worker in workers:
        worker.start()

    writeTidyBatch(top)
    runTidyBatch()
    os.system("clear")
    cleanDocs(top)

    # Wait for the background files to be fully written before reporting.
    for worker in workers:
        worker.join()

    endTime = datetime.now()
    timeDiff = endTime - startTime
    print("Total time taken : " + str(timeDiff) + " on " + str(fileCount(top)) + " files")
    if cantOpen > 0:
        print(str(cantOpen) + " files could not be opened to run regular expressions on")
    print("\n")


    
    
    
    
    
    

At work, I had the need for an HTML Tidy script that did a little extra, and that could traverse through directories recursively, so I decided to take the opportunity to learn Python as well.

This script takes a given directory (or the current directory if none is given) and lists all .htm files (we don't use .html files at work). It then creates a batch file to run HTML Tidy, a file for checking validation with A Real Validator, and an HTML file that makes it easy to do QA on larger batches. Finally, it goes through a list of about 10 other regular expressions to clean up extra code that Tidy doesn't take care of.

So far, benchmarks show that it can do roughly 5000 pages per hour. Obviously larger pages with more validation issues will cause it to run longer.

This script is very customized towards Government of Canada Common Look and Feel 2, and it was also written using Cygwin Bash Shell for Windows with Python installed.