#!/usr/bin/python

# $Id: check_license.py,v 1.22 2009-02-26 17:46:16 potyra Exp $
#
# Copyright (C) 2009 FAUmachine Team <info@faumachine.org>.
# This program is free software. You can redistribute it and/or modify it
# under the terms of the GNU General Public License, either version 2 of
# the License, or (at your option) any later version. See COPYING.

import sys
import string
import re
import os
import struct

# number of lines scanned in source files
SOURCE_LINES=50

# don't traverse to any directories named like these
# (matched against the bare directory name, at any depth)
BLACKLISTED_DIRS = (
			# cvs specific
			"CVS", 
			# created by automake
			".deps"
			)
# skip these exact directories
# (matched against the path as produced by os.walk("."), hence the
# leading "./")
BLACKLISTED_PATHDIRS= (
			# generated by autoconf
			"./autom4te.cache",
			# FIXME
			"./experiments",
			# FIXME
			"./scripts/test-FAUmachine/test-engine/templates",
			# FIXME
			"./node-pc/keymaps"
			)
# if a file named like this appears in any directory, skip it.
BLACKLISTED_FILES = (	
			# cvs specific
			".cvsignore", 
			# keep at least two entries in this list: a single
			# parenthesized string without a trailing comma is a
			# plain string, not a tuple
			"README"
			)
# skip checking files with the exact path
# (matched against the path as produced by os.walk("."), hence the
# leading "./")
BLACKLISTED_PATHFILES = (
			# that's the destination file
			"./copyright",
			# only an informational text file
			"./AUTHORS",
			# our changelog file
			"./NEWS",
			# informational text file
			"./README.bsd",
			# informational text file
			"./README.macosx",
			# informational text file
			"./STATE",
			# informational text file
			"./TODO",
			# GPL itself
			"./COPYING",
			# LGPL itself
			"./COPYING.LIB",
			# generated, empty
			"./stamp-h1",
			# informational text file
			"./INSTALL",
			# installed by autotools automatically
			"./scripts/install-sh",
			# informational text file
			"./doc/IO_PORTS.txt",
			# informational text file
			"./doc/CODINGSTYLE",
			# TODO can desktop files have comments?
			"./doc/faum.desktop"
			)

class CopyrightHolder:
	""" one copyright holder, as named in a "Copyright (c) ..." line.

	Equality, ordering and hashing look at last name and first name
	only; e-mail address and years are kept for display but ignored
	when comparing.
	"""

	def __init__(self, firstname, lastname, email, year1, year2):
		""" c'tor.

		firstname, lastname: name of the holder.
		email: e-mail address as it should be printed, or None.
		year1, year2: first/last copyright year (anything int()
		accepts), or None if not given.
		"""
		self._firstname = firstname
		self._lastname = lastname
		self._email = email
		self._year1 = None if year1 is None else int(year1)
		self._year2 = None if year2 is None else int(year2)

	def isFAUmachineTeam(self):
		""" is this holder the "FAUmachine Team"? """
		return     (self._firstname == "FAUmachine") \
		       and (self._lastname == "Team")

	def __str__(self):
		""" render as "Copyright (c) [year1][-year2] by first last
		    [email]"; parts that are None are left out.
		"""
		parts = ["Copyright (c)"]
		if self._year1 is not None:
			parts.append(" %d" % self._year1)
		if self._year2 is not None:
			parts.append("-%d" % self._year2)
		parts.append(" by")
		for item in (self._firstname, self._lastname, self._email):
			if item is not None:
				parts.append(" %s" % item)

		return "".join(parts)

	def _key(self):
		""" identity/sort key: last name first, then first name """
		return (self._lastname, self._firstname)

	# Rich comparisons are spelled out because __cmp__ and cmp() are
	# honoured by Python 2 only; without __eq__ two holders with the
	# same name would not be recognized as equal on Python 3.
	def __eq__(self, other):
		if not isinstance(other, CopyrightHolder):
			return NotImplemented
		return self._key() == other._key()

	def __ne__(self, other):
		if not isinstance(other, CopyrightHolder):
			return NotImplemented
		return self._key() != other._key()

	def __lt__(self, other):
		if not isinstance(other, CopyrightHolder):
			return NotImplemented
		return self._key() < other._key()

	def __le__(self, other):
		if not isinstance(other, CopyrightHolder):
			return NotImplemented
		return self._key() <= other._key()

	def __gt__(self, other):
		if not isinstance(other, CopyrightHolder):
			return NotImplemented
		return self._key() > other._key()

	def __ge__(self, other):
		if not isinstance(other, CopyrightHolder):
			return NotImplemented
		return self._key() >= other._key()

	def __cmp__(self, other):
		""" three-way comparison, kept for callers using it directly.

		two holders are identical, if first and last name match;
		sorts by lastname, then firstname.
		"""
		mine = self._key()
		theirs = other._key()
		# same result as cmp(mine, theirs), without needing cmp()
		return (mine > theirs) - (mine < theirs)

	def __hash__(self):
		""" hash consistent with __eq__ (names only) """
		return hash(self._key())


class LicensedFileBase:
	""" base class for all licensed files

	Stores what was extracted from one file: the copyright holders,
	a license shortcut ("GPL-2+", "LGPL-2+" or "unknown") and whether
	the file is marked as generated.  Instances compare, sort and hash
	by license shortcut and holders only -- the path is ignored -- so
	they can serve as dictionary keys grouping files with identical
	license information.
	"""

	# one "Copyright (c) [year1][-year2] [by] First Last [<email>]"
	# statement.  Raw string, so the backslash escapes reach the regex
	# engine untouched; compiled once for all scanned files.
	_HOLDER_RE = re.compile(r"""
			[Cc]opyright[ ]
			\([cC]\)
			(?:[ ](?P<year1>[\d]+))?
			(?:-(?P<year2>[\d]+))?
			(?:[ ]by)?
			[ ]
			(?P<firstname>[\w]+)
			[ ]
			(?P<lastname>[\w]+)
			(?:[ ](?P<email><[\w]+@[\w.]+>))?
			""", re.VERBOSE)

	def __init__(self, path):
		""" c'tor. Path: path to file to check """
		# path to file
		self._path = path
		# copyright holders list
		self._holders = []
		# license shortcut
		self._shortcut = ""
		# is it a generated file?
		self._generated = False

	def getPath(self):
		""" get pathname to file """
		return self._path

	def isStandard(self):
		""" is this file authored solely by FAUmachine AUTHORS and 
		    distributable under GPL-2+?
		"""
		return     (self._shortcut == "GPL-2+") \
		       and (len(self._holders) == 1) \
		       and (self._holders[0].isFAUmachineTeam())

	def getLicenseShortcut(self):
		""" returns the license shortcut (if any)
		"""
		return self._shortcut

	def isGenerated(self):
		""" returns True if the file is generated through another file
		"""
		return self._generated

	def _process(self, txt):
		""" set all members by evaluating the textual license txt
		"""
		self._findHolders(txt)

		# GPL is tested first; anything matching neither is "unknown"
		if LicensedFileBase._isGPL2P(txt):
			self._shortcut = "GPL-2+"
		elif LicensedFileBase._isLGPL2P(txt):
			self._shortcut = "LGPL-2+"
		else:
			self._shortcut = "unknown"

		self._generated = LicensedFileBase._isGenerated(txt)

	def _findHolders(self, txt):
		""" append one CopyrightHolder per copyright statement found
		    in txt to the list of holders
		"""
		for m in LicensedFileBase._HOLDER_RE.finditer(txt):
			# group names match the c'tor's parameter names
			self._holders.append(CopyrightHolder(**m.groupdict()))

	def _sortKey(self):
		""" comparison key: license shortcut, then the sorted holders """
		return (self._shortcut, sorted(self._holders))

	# Rich comparisons are spelled out because __cmp__ and cmp() are
	# honoured by Python 2 only; dictionary lookups need __eq__ to
	# agree with __hash__.
	def __eq__(self, other):
		if not isinstance(other, LicensedFileBase):
			return NotImplemented
		return self._sortKey() == other._sortKey()

	def __ne__(self, other):
		if not isinstance(other, LicensedFileBase):
			return NotImplemented
		return self._sortKey() != other._sortKey()

	def __lt__(self, other):
		if not isinstance(other, LicensedFileBase):
			return NotImplemented
		return self._sortKey() < other._sortKey()

	def __le__(self, other):
		if not isinstance(other, LicensedFileBase):
			return NotImplemented
		return self._sortKey() <= other._sortKey()

	def __gt__(self, other):
		if not isinstance(other, LicensedFileBase):
			return NotImplemented
		return self._sortKey() > other._sortKey()

	def __ge__(self, other):
		if not isinstance(other, LicensedFileBase):
			return NotImplemented
		return self._sortKey() >= other._sortKey()

	def __cmp__(self, other):
		""" comparison method, kept for callers using it directly:
		    by license shortcut first, then by sorted holders
		"""
		mine = self._sortKey()
		theirs = other._sortKey()
		# same result as cmp(mine, theirs), without needing cmp()
		return (mine > theirs) - (mine < theirs)

	def __str__(self):
		""" one indented line per holder, followed by the license """
		lines = ["  %s\n" % h for h in self._holders]
		lines.append("  License: %s." % self._shortcut)
		return "".join(lines)

	def __hash__(self):
		""" hash consistent with the comparisons: xor-ing makes the
		    result independent of the order of the holders
		"""
		h1 = hash(self._shortcut)
		for holder in self._holders:
			h1 ^= hash(holder)

		return h1

	@staticmethod
	def _isGPL2P(txt):
		""" does txt contain the GPL "version 2 or later" wording? """
		needles = (
			r"GNU General Public License",
			r"either version 2 of",
			r"or (at your option) any later version",
			)
		return all(n in txt for n in needles)

	@staticmethod
	def _isLGPL2P(txt):
		""" does txt contain the LGPL "version 2 or later" wording? """
		needles = (
			r"GNU " + "Lesser General Public License",
			r"either version 2 of",
			r"or (at your option) any later version.",
			)
		return all(n in txt for n in needles)

	@staticmethod
	def _isGenerated(txt):
		""" does txt carry a "generated by/from" or "Generator:"
		    marker?
		"""
		gr = r"[gG]enerated (?:by|from)|Generator:"
		return re.search(gr, txt) is not None

	@staticmethod
	def _sanitizeText(txt):
		""" strip the comment characters '#', '/' and '*' from txt and
		    collapse every run of whitespace into a single space
		"""
		# re.sub rather than string.maketrans/str.translate with a
		# delete argument, which only exist in that form on Python 2
		t = re.sub(r"[#/*]", "", txt)

		# replace all subsequent whitespace with one space.
		return re.sub(r"[\s]+", " ", t)

	


class LicensedTextFile(LicensedFileBase):
	""" class for all textual files """
	def __init__(self, path):
		""" c'tor. Path: path to file to check; the file is scanned
		    right away.
		"""
		LicensedFileBase.__init__(self, path)
		self._scan()

	def _scan(self):
		""" read the first SOURCE_LINES lines of the file and evaluate
		    the license text found in them
		"""
		# open() instead of the file() constructor; the with block
		# closes the file even if reading fails.
		with open(self._path, "r") as f:
			# range(SOURCE_LINES), not range(1, SOURCE_LINES), which
			# would stop one line short.  readline() returns "" at
			# end of file, so short files need no special casing.
			head = [f.readline() for _ in range(SOURCE_LINES)]

		txt = LicensedFileBase._sanitizeText(" ".join(head))
		self._process(txt)

class LicensedPNGFile(LicensedFileBase):
	""" class for PNG files """
	def __init__(self, path):
		""" c'tor. Path: path to file to check; the file is scanned
		    right away.
		"""
		LicensedFileBase.__init__(self, path)
		self._scan()

	def _scan(self):
		""" take the text from the "Comment" section of texts
		    stored in the PNG file

		    Raises Exception if the file does not start with the PNG
		    magic bytes or ends before the IEND chunk.
		"""
		txt = ""
		# binary mode: the content is not text.  The with block closes
		# the file on every exit path, including the raises below.
		with open(self._path, "rb") as f:
			if f.read(4) != b"\x89PNG":
				raise Exception("%s is not a PNG file" % self._path)

			# skip remaining header bytes
			f.seek(4, os.SEEK_CUR)

			while True:
				# every chunk starts with a big endian uint32_t
				# length followed by a 4 byte chunk type
				head = f.read(8)
				if len(head) < 8:
					raise Exception("Corrupt file %s" % self._path)
				length, ctype = struct.unpack(">L4s", head)

				# IEND is the last chunk.  A zero length alone does
				# not mark the end, other chunks may be empty too.
				if ctype == b"IEND":
					break

				if ctype != b"tEXt":
					# uninteresting, skip this chunk
					# also skip crc32 at end of chunk
					f.seek(length + 4, os.SEEK_CUR)
					continue

				# it is a tEXt chunk
				txt += LicensedPNGFile._readtEXtChunk(f, length)

				# skip crc 32
				f.seek(4, os.SEEK_CUR)

		self._process(txt)

	@staticmethod
	def _readtEXtChunk(f, length):
		""" read the length bytes of a tEXt chunk's data from f (a
		    file opened in binary mode) and return the text stored
		    under the key "Comment", or "" if there is none.

		    Raises Exception if fewer than length bytes are left.
		"""
		data = f.read(length)
		if len(data) < length:
			raise Exception("Corrupt PNG file: truncated tEXt chunk")

		# text chunks are in the form (key\0value)+
		fields = data.split(b"\0")
		if len(fields) % 2:
			# a trailing key without value maps to an empty text
			fields.append(b"")
		comments = dict(zip(fields[0::2], fields[1::2]))

		# default: don't look at other sections
		value = comments.get(b"Comment", b"")
		if not isinstance(value, str):
			# only true where bytes and str are distinct types
			# (Python 3); latin-1 maps every byte to one character
			value = value.decode("latin-1")
		return value

class FileRegistry:
	""" keep track of scanned files
	"""
	def __init__(self):
		# plain list with LicensedFileBase of all checked files
		self._files = []
		# dictionary key: LicensedFileBase (compares by license and
		# holders) value: list of all files comparing equal to the key
		self._filedict = {}

	def checkFile(self, path):
		""" check/register file with given path

		    Files that are neither text nor PNG, and files marked as
		    generated, are not registered.
		"""
		if FileRegistry.isTextFile(path):
			lf = LicensedTextFile(path)
		elif FileRegistry.isPNGFile(path):
			lf = LicensedPNGFile(path)
		else:
			#print "Warning: Not checking %s" % path
			return

		if lf.isGenerated():
			return

		self._files.append(lf)
		self._filedict.setdefault(lf, []).append(lf)

	def traverse(self):
		""" walk the tree below the current directory and check every
		    file that is not blacklisted
		"""
		for root, dirs, files in os.walk("."):
			# slice assignment: os.walk only skips directories
			# removed from the very list object it handed out
			dirs[:] = [
				d for d in dirs
				if d not in BLACKLISTED_DIRS
				and os.path.join(root, d) not in BLACKLISTED_PATHDIRS
				]

			for name in files:
				if name in BLACKLISTED_FILES:
					continue
				path = os.path.join(root, name)
				if path in BLACKLISTED_PATHFILES:
					continue
				self.checkFile(path)

	def __str__(self):
		""" one paragraph per group of files sharing holders and
		    license: the paths ("* " marks the first, "," separates,
		    ":" ends the list), a blank line, then holders and license
		"""
		out = []
		# FIXME need to find minimum/maximum year bounds
		# iterate this registry's own dictionary, not a global one
		groups = sorted(self._filedict.items(), key=lambda kv: kv[0])
		for key, valuelist in groups:
			# all entries of a group compare equal, so sorting them
			# directly would not order anything; sort by path (on a
			# copy, printing should not modify the registry)
			paths = sorted(lf.getPath() for lf in valuelist)
			last = len(paths) - 1

			for i, path in enumerate(paths):
				prefix = "* " if i == 0 else "  "
				suffix = ":" if i == last else ","
				out.append("%s%s%s\n" % (prefix, path, suffix))
			out.append("\n%s\n\n" % key)
		return "".join(out)

	def filterOutDefault(self, firstname, lastname, license):
		""" drop the group of files whose only holder is firstname
		    lastname and whose license shortcut is license
		"""
		# build a key that compares equal to the group in question
		lf = LicensedFileBase("dummy")
		lf._holders.append(
			CopyrightHolder(firstname, lastname, "", 0, 0))
		lf._shortcut = license

		self._filedict.pop(lf, None)

	@staticmethod
	def isTextFile(path):
		""" is path a text file?  Everything that is not ruled out by
		    its extension or by ELF magic bytes counts as text.
		"""
		# quick filter for binary files.
		if path.endswith((".o", ".a", ".rom", ".png")):
			return False

		# tiny magic: check for ELF magic bytes
		with open(path, "rb") as f:
			magic = f.read(4)

		return magic != b"\x7fELF"

	@staticmethod
	def isPNGFile(path):
		""" is path a PNG file?
		"""
		if not path.endswith(".png"):
			return False

		with open(path, "rb") as f:
			magic = f.read(4)

		return magic == b"\x89PNG"




if __name__ == '__main__':
	# scan the tree below the current directory, drop the group that is
	# solely "FAUmachine Team" under GPL-2+, and print what is left.
	fr = FileRegistry()
	fr.traverse()
	fr.filterOutDefault("FAUmachine", "Team", "GPL-2+")
	# parenthesized form: same output as the print statement, and it
	# also parses where print is a function
	print(fr)
