#!/usr/bin/env python

from __future__ import nested_scopes, generators, division, absolute_import, with_statement, print_function

# xls.py - Provides the same API for reading CSV, XLS and XLSX spreadsheet files
# 20200422 raf <raf@raf.org>

'''
xls - Provides the same API for reading CSV, XLS and XLSX spreadsheet files
20200422 raf <raf@raf.org>

This module provides the reader() function which returns an iterator over
the rows contained in either a CSV file or the first worksheet (by default)
of an XLS or XLSX file whose name is given as an argument. If an additional
zero-based index argument is also supplied, the rows of the corresponding
worksheet are iterated over. If the index argument is -1, all rows of all
worksheets will be iterated over with a simulated blank row between each
worksheet.

It also provides the read() function which returns the parsed contents
of a CSV file or all the worksheets of an XLS or XLSX file as an array
of worksheets.

Single worksheet example (or multiple worksheets as a single worksheet):

	import xls
	for row in xls.reader('example.xls', i=0):
		print('%r' % row)
	xls.reader_finish('example.xls') # Only needed for xlsx files on Windows

Multiple worksheet example:

	import xls
	worksheets = xls.read('example.xls')
	for worksheet in worksheets:
		for row in worksheet:
			print('%r' % row)

'''

import os, csv, xlrd, openpyxl, re, decimal, datetime

_reader_books = {}

def reader(fname, i=0):
	'''If fname ends in ".csv" or ".txt", return a CSV reader for the
	corresponding file. If fname ends in ".xlsx", return an equivalent
	XLSX reader. If dname ends in ".xls", return an equivalent XLS reader.
	Otherwise raise an exception.

	For XLS and XLSX files:
	 - If i is supplied, it is the zero-based index of the worksheet to read.
	 - If i is negative, all sheets are read (each separated by an empty row).
	 - All cell values are returned as strings (consistent with the csv module).
	 - Numbers are rounded according to their display format.

	 For XLSX files, the function reader_finish() should be called when finished
	 reading the file because the underlying openpyxl module leaves these files
	 open which will hamper attempts to delete or rename or move the file under
	 Windows. This isn't necessary when using the read() function which handles
	 explicitly closing XLSX files by itself.
	 '''
	if fname[-4:].lower() in ['.csv', '.txt']:
		convert_charset(fname)
		return _csv_iter(open(fname, 'rb'))
	if fname[-5:].lower() in ['.xlsx']:
		return _xlsx_iter(fname, i)
	if fname[-4:].lower() in ['.xls']:
		return _xls_iter(fname, i)
	raise Exception('xls.reader(): Unexpected filename: %s' % fname)

def reader_finish(fname):
	'''If fname ends in ".xlsx", handle the fact that openpyxl leaves
	the file open after reading and so the file can't be renamed afterwards
	on Windows if the client wants to. Exclicitly close the workbook.
	For other files, this does nothing. This only needs to be called after
	using the reader() function to read an XLSX file if there is a need
	to delete or rename the file on Windows afterwards. If it not needed
	after reading an XLSX file with the read() function.'''
	if fname[-5:].lower() != '.xlsx':
		return
	if fname not in _reader_books:
		return
	book = _reader_books[fname]
	del _reader_books[fname]
	try:
		book.close()
	except:
		pass

def read(fname):
	'''If fname ends in ".csv" or ".txt", parses the corresponding file as
	a CSV file. If fname ends in ".xlsx", it is parsed as an XLSX file. If
	fname ends in ".xls", it is parsed as an XLS file. Otherwise raise an
	exception.
	The data is returned as a list of lists of lists of cells. As with xls.reader():
	 - All cell values are returned as strings (consistent with the csv module).
	 - Numbers are rounded according to their display format.
	'''
	if fname[-4:].lower() in ['.csv', '.txt']:
		convert_charset(fname)
		return [[_ for _ in _csv_iter(open(fname, 'rb'))]]
	if fname[-5:].lower() in ['.xlsx']:
		book = openpyxl.load_workbook(fname, read_only=True, data_only=True)
		rows = [[_ for _ in _xlsx_iter(fname, i, book)] for i in range(len(book.sheetnames))]
		reader_finish(fname)
		return rows
	if fname[-4:].lower() in ['.xls']:
		book = xlrd.open_workbook(fname, formatting_info=True, logfile=_devnull)
		book.has_formatting = True
		return [[_ for _ in _xls_iter(fname, i, book)] for i in range(book.nsheets)]
	raise Exception('xls.read(): Unexpected filename: %s' % fname)

_devnull = open('/dev/null' if os.path.exists('/dev/null') else 'nul:', 'wb')

def _csv_iter(f):
	'''Like csv.reader() but assumes Latin-1 and decodes to unicode if non-ASCII.'''
	#default_charset = locale.getdefaultlocale()[1] or 'ISO8859-1'
	default_charset = 'ISO8859-1'
	for row in csv.reader(f):
		for i in xrange(len(row)):
			if len(row[i].translate(None, ''.join([chr(_) for _ in range(128)]))):
				row[i] = row[i].decode(default_charset, 'replace')
		yield row

def _xls_iter(fname, i=0, book=None):
	'''Returns an iterator over the rows in the given worksheet(s) of the given XLS file.
	If book is None, it is opened.'''
	if book is None:
		try:
			book = xlrd.open_workbook(fname, formatting_info=True, logfile=_devnull)
			book.has_formatting = True
		except NotImplementedError:
			book = xlrd.open_workbook(fname, formatting_info=False, logfile=_devnull)
			book.has_formatting = False
	def iterator():
		for j in [i] if i >= 0 else range(book.nsheets):
			# Separate multiple worksheets with a blank line
			if i < 0 and j > 0:
				yield []
			sheet = book.sheet_by_index(j)
			for r in range(sheet.nrows):
				yield _xls_getrow(book, sheet, r)
	return iterator()

def _xlsx_iter(fname, i=0, book=None):
	'''Returns an iterator over the rows in the given worksheet(s) of the given XLSX file.
	If book is None, it is opened.'''
	if book is None:
		book = openpyxl.load_workbook(fname, read_only=True, data_only=True)
	# Save the book so we can explicitly close it after reading the file
	_reader_books[fname] = book
	def iterator():
		for j in [i] if i >= 0 else range(len(book.sheetnames)):
			# Separate multiple worksheets with a blank line
			if i < 0 and j > 0:
				yield []
			sheet = book[book.sheetnames[j]]
			r = 0
			for row in sheet.iter_rows(values_only=False):
				yield _xlsx_getrow(book, sheet, row, r)
				r += 1
	return iterator()

def _xls_getrow(book, sheet, r):
	'''Translate the given XLS row into text.'''
	row = []
	for typ, value in zip(sheet.row_types(r), sheet.row_values(r)):
		# Types:
		# 0 = empty u''
		# 1 = unicode text
		# 2 = float (convert to int if possible, then convert to string)
		# 3 = date (convert to unambiguous date/time string)
		# 4 = boolean (convert to string "0" or "1")
		# 5 = error (convert from code to error text)
		# 6 = blank u''
		if typ == 2:
			if book.has_formatting:
				xf_index = sheet.cell_xf_index(r, len(row))
				xf = book.xf_list[xf_index]
				fmt = book.format_map[xf.format_key]
				match = re.compile('\.(0+)').search(fmt.format_str)
				if match is not None:
					digits = len(match.group(1))
					value = decimal.Decimal(str(value)).quantize(decimal.Decimal((0, (1,), -digits)), rounding=decimal.ROUND_HALF_UP)
			else:
				# Note: When reading xlsx files, formatting_info is not
				# present so we don't know where to round floating point
				# numbers to so we don't. This may not be acceptable but it
				# shouldn't really matter for our purposes as most files
				# that we import contain manually entered values.
				# However, that's not always true so this does matter.
				# This is not safe for use with .xlsx files.
				value = decimal.Decimal(str(value))
			if value == int(value):
				value = int(value)
			value = str(value)
		elif typ == 3:
			try:
				# A date value of 1.0 means midnight (i.e. "24:00:00" which appears in-cell as "00:00:00").
				# This is seen as ambiguous by xlrd (i.e. start or end of the day, presumably?).
				# But we want to see "00:00:00" so we change it to 0.0 before getting the tuple.
				if value == 1.0:
					value = 0.0
				t = xlrd.xldate_as_tuple(value, book.datemode)
				# No date, just time
				if t[0] == 0 and t[1] == 0 and t[2] == 0:
					value = "%02d:%02d:%02d" % t[3:6]
				# No time, just date
				elif t[3] == 0 and t[4] == 0 and t[5] == 0:
					value = "%04d-%02d-%02d" % t[0:3]
				# Date and time
				else:
					value = "%04d-%02d-%02dT%02d:%02d:%02d" % t
			except xlrd.xldate.XLDateNegative:
				value = 'Negative Date: %s' % value
			except xlrd.xldate.XLDateAmbiguous:
				value = 'Ambiguous Date: %s' % value
			except xlrd.xldate.XLDateTooLarge:
				value = 'Too Large Date: %s' % value
			except xlrd.xldate.XLDateBadDatemode:
				value = 'Invalid Date Mode: (%s, %s)' % (value, book.datemode)
		elif typ == 4:
			value = str(value)
		elif typ == 5:
			value = xlrd.error_text_from_code[value]
		row.append(value)
	# Strip trailing empty cells
	while len(row) and row[-1] == '':
		row = row[:-1]
	return row

def _xlsx_getrow(book, sheet, cellrow, r):
	'''Translate the given XLSX row into text.'''
	row = []
	for cell in cellrow:
		typ, value, fmt = cell.data_type, cell.value, cell.number_format
		# Types:
		# 's' = unicode text
		# 'd' = datetime.datetime or datetime.time
		# 'n' = number (int or float)
		# 'e' = error
		if value is None:
			value = ''
		elif typ == 's': # String
			pass # value is already a string
		elif typ == 'd': # Date and/or time (ignore the 3 added microseconds)
			# No date, just time
			if isinstance(value, datetime.time):
				value = '%02d:%02d%s' % (value.hour, value.minute, (':%02d' % value.second) if 'S' in fmt else '')
			# Date and time, or just date
			elif isinstance(value, datetime.datetime):
				if value.hour or value.minute or value.second:
					value = "%04d-%02d-%02dT%02d:%02d%s" % (value.year, value.month, value.day, value.hour, value.minute, (':%02d' % value.second) if 'S' in fmt else '')
				else: # Just a date returned as a datetime.datetime
					value = "%04d-%02d-%02d" % (value.year, value.month, value.day)
		elif typ == 'n': # Number
			match = re.compile('\.(0+)').search(fmt)
			if match is not None:
				digits = len(match.group(1))
				value = decimal.Decimal(str(value)).quantize(decimal.Decimal((0, (1,), -digits)), rounding=decimal.ROUND_HALF_UP)
			if value == int(value):
				val = int(value)
			value = str(value)
		elif typ == 'e' : # Error
			pass # Value is something like '#DIV/0!' or '#VALUE!' or '#REF!'
		else:
			#import sys
			#print('xlsx: Unexpected cell typ=%r val=%r fmt=%r' % (typ, value, fmt), file=sys.stderr)
			if not isinstance(value, basestring):
				value = str(value)
		row.append(value)
	# Strip trailing empty cells
	while len(row) and row[-1] == '':
		row = row[:-1]
	return row

def convert_charset(fname):
	'''Convert the file with the given name from UTF32, UTF16 or UTF8-with-BOM to Latin-1.'''
	f = open(fname, 'rb')
	bytes = f.read()
	f.close()
	converted = None
	if bytes[0:4] == '\x00\x00\xfe\xff': # UTF-32, big-endian
		converted = bytes.decode('utf-32').encode('iso-8859-1', 'replace')
	elif bytes[0:4] == '\xff\xfe\x00\x00': # UTF-32, little-endian
		converted = bytes.decode('utf-32').encode('iso-8859-1', 'replace')
	elif bytes[0:2] == '\xfe\xff': # UTF-16, big-endian
		converted = bytes.decode('utf-16').encode('iso-8859-1', 'replace')
	elif bytes[0:2] == '\xff\xfe': # UTF-16, little-endian
		converted = bytes.decode('utf-16').encode('iso-8859-1', 'replace')
	elif bytes[0:3] == '\xef\xbb\xbf': # utf8
		converted = bytes.decode('utf-16').encode('iso-8859-1', 'replace')
	if converted is not None:
		f = open(fname, 'wb')
		f.write(converted)
		f.close()

#ifdef TEST
if __name__ == '__main__':
	def _csv_quote(value):
		'''Return value quoted as needed for inclusion in a CSV file.'''
		if value is None:
			return ''
		if not isinstance(value, basestring):
			value = str(value)
		return '"' + value.replace('"', '""') + '"' if any([_ in value for _ in '",\r\n\v\f']) else value

	def main():
		'''Test reader() by reading the csv or xls or xlsx file whose name is given
		on the command line and printing out its contents in csv format.
		It is up to the user to verify that it really worked.'''
		import sys
		if len(sys.argv) not in [2, 3]:
			print('usage: %s filename [sheetnumber (1-based)]' % sys.argv[0])
			sys.exit(1)
		i = int(sys.argv[2]) - 1 if len(sys.argv) == 3 else -1
		for row in reader(sys.argv[1], i):
			print('%s' % ','.join([_csv_quote(unicode(cell.strip()).encode('utf8')) for cell in row]))

	main()
#endif TEST

# vi:set ts=4 sw=4: