#!/usr/bin/env python from __future__ import nested_scopes, generators, division, absolute_import, with_statement, print_function # xls.py - Provides the same API for reading CSV, XLS and XLSX spreadsheet files # 20200422 raf ''' xls - Provides the same API for reading CSV, XLS and XLSX spreadsheet files 20200422 raf This module provides the reader() function which returns an iterator over the rows contained in either a CSV file or the first worksheet (by default) of an XLS or XLSX file whose name is given as an argument. If an additional zero-based index argument is also supplied, the rows of the corresponding worksheet are iterated over. If the index argument is -1, all rows of all worksheets will be iterated over with a simulated blank row between each worksheet. It also provides the read() function which returns the parsed contents of a CSV file or all the worksheets of an XLS or XLSX file as an array of worksheets. Single worksheet example (or multiple worksheets as a single worksheet): import xls for row in xls.reader('example.xls', i=0): print('%r' % row) xls.reader_finish('example.xls') # Only needed for xlsx files on Windows Multiple worksheet example: import xls worksheets = xls.read('example.xls') for worksheet in worksheets: for row in worksheet: print('%r' % row) ''' import os, csv, xlrd, openpyxl, re, decimal, datetime _reader_books = {} def reader(fname, i=0): '''If fname ends in ".csv" or ".txt", return a CSV reader for the corresponding file. If fname ends in ".xlsx", return an equivalent XLSX reader. If dname ends in ".xls", return an equivalent XLS reader. Otherwise raise an exception. For XLS and XLSX files: - If i is supplied, it is the zero-based index of the worksheet to read. - If i is negative, all sheets are read (each separated by an empty row). - All cell values are returned as strings (consistent with the csv module). - Numbers are rounded according to their display format. For XLSX files, the function reader_finish() should be called when finished reading the file because the underlying openpyxl module leaves these files open which will hamper attempts to delete or rename or move the file under Windows. This isn't necessary when using the read() function which handles explicitly closing XLSX files by itself. ''' if fname[-4:].lower() in ['.csv', '.txt']: convert_charset(fname) return _csv_iter(open(fname, 'rb')) if fname[-5:].lower() in ['.xlsx']: return _xlsx_iter(fname, i) if fname[-4:].lower() in ['.xls']: return _xls_iter(fname, i) raise Exception('xls.reader(): Unexpected filename: %s' % fname) def reader_finish(fname): '''If fname ends in ".xlsx", handle the fact that openpyxl leaves the file open after reading and so the file can't be renamed afterwards on Windows if the client wants to. Exclicitly close the workbook. For other files, this does nothing. This only needs to be called after using the reader() function to read an XLSX file if there is a need to delete or rename the file on Windows afterwards. If it not needed after reading an XLSX file with the read() function.''' if fname[-5:].lower() != '.xlsx': return if fname not in _reader_books: return book = _reader_books[fname] del _reader_books[fname] try: book.close() except: pass def read(fname): '''If fname ends in ".csv" or ".txt", parses the corresponding file as a CSV file. If fname ends in ".xlsx", it is parsed as an XLSX file. If fname ends in ".xls", it is parsed as an XLS file. Otherwise raise an exception. The data is returned as a list of lists of lists of cells. As with xls.reader(): - All cell values are returned as strings (consistent with the csv module). - Numbers are rounded according to their display format. ''' if fname[-4:].lower() in ['.csv', '.txt']: convert_charset(fname) return [[_ for _ in _csv_iter(open(fname, 'rb'))]] if fname[-5:].lower() in ['.xlsx']: book = openpyxl.load_workbook(fname, read_only=True, data_only=True) rows = [[_ for _ in _xlsx_iter(fname, i, book)] for i in range(len(book.sheetnames))] reader_finish(fname) return rows if fname[-4:].lower() in ['.xls']: book = xlrd.open_workbook(fname, formatting_info=True, logfile=_devnull) book.has_formatting = True return [[_ for _ in _xls_iter(fname, i, book)] for i in range(book.nsheets)] raise Exception('xls.read(): Unexpected filename: %s' % fname) _devnull = open('/dev/null' if os.path.exists('/dev/null') else 'nul:', 'wb') def _csv_iter(f): '''Like csv.reader() but assumes Latin-1 and decodes to unicode if non-ASCII.''' #default_charset = locale.getdefaultlocale()[1] or 'ISO8859-1' default_charset = 'ISO8859-1' for row in csv.reader(f): for i in xrange(len(row)): if len(row[i].translate(None, ''.join([chr(_) for _ in range(128)]))): row[i] = row[i].decode(default_charset, 'replace') yield row def _xls_iter(fname, i=0, book=None): '''Returns an iterator over the rows in the given worksheet(s) of the given XLS file. If book is None, it is opened.''' if book is None: try: book = xlrd.open_workbook(fname, formatting_info=True, logfile=_devnull) book.has_formatting = True except NotImplementedError: book = xlrd.open_workbook(fname, formatting_info=False, logfile=_devnull) book.has_formatting = False def iterator(): for j in [i] if i >= 0 else range(book.nsheets): # Separate multiple worksheets with a blank line if i < 0 and j > 0: yield [] sheet = book.sheet_by_index(j) for r in range(sheet.nrows): yield _xls_getrow(book, sheet, r) return iterator() def _xlsx_iter(fname, i=0, book=None): '''Returns an iterator over the rows in the given worksheet(s) of the given XLSX file. If book is None, it is opened.''' if book is None: book = openpyxl.load_workbook(fname, read_only=True, data_only=True) # Save the book so we can explicitly close it after reading the file _reader_books[fname] = book def iterator(): for j in [i] if i >= 0 else range(len(book.sheetnames)): # Separate multiple worksheets with a blank line if i < 0 and j > 0: yield [] sheet = book[book.sheetnames[j]] r = 0 for row in sheet.iter_rows(values_only=False): yield _xlsx_getrow(book, sheet, row, r) r += 1 return iterator() def _xls_getrow(book, sheet, r): '''Translate the given XLS row into text.''' row = [] for typ, value in zip(sheet.row_types(r), sheet.row_values(r)): # Types: # 0 = empty u'' # 1 = unicode text # 2 = float (convert to int if possible, then convert to string) # 3 = date (convert to unambiguous date/time string) # 4 = boolean (convert to string "0" or "1") # 5 = error (convert from code to error text) # 6 = blank u'' if typ == 2: if book.has_formatting: xf_index = sheet.cell_xf_index(r, len(row)) xf = book.xf_list[xf_index] fmt = book.format_map[xf.format_key] match = re.compile('\.(0+)').search(fmt.format_str) if match is not None: digits = len(match.group(1)) value = decimal.Decimal(str(value)).quantize(decimal.Decimal((0, (1,), -digits)), rounding=decimal.ROUND_HALF_UP) else: # Note: When reading xlsx files, formatting_info is not # present so we don't know where to round floating point # numbers to so we don't. This may not be acceptable but it # shouldn't really matter for our purposes as most files # that we import contain manually entered values. # However, that's not always true so this does matter. # This is not safe for use with .xlsx files. value = decimal.Decimal(str(value)) if value == int(value): value = int(value) value = str(value) elif typ == 3: try: # A date value of 1.0 means midnight (i.e. "24:00:00" which appears in-cell as "00:00:00"). # This is seen as ambiguous by xlrd (i.e. start or end of the day, presumably?). # But we want to see "00:00:00" so we change it to 0.0 before getting the tuple. if value == 1.0: value = 0.0 t = xlrd.xldate_as_tuple(value, book.datemode) # No date, just time if t[0] == 0 and t[1] == 0 and t[2] == 0: value = "%02d:%02d:%02d" % t[3:6] # No time, just date elif t[3] == 0 and t[4] == 0 and t[5] == 0: value = "%04d-%02d-%02d" % t[0:3] # Date and time else: value = "%04d-%02d-%02dT%02d:%02d:%02d" % t except xlrd.xldate.XLDateNegative: value = 'Negative Date: %s' % value except xlrd.xldate.XLDateAmbiguous: value = 'Ambiguous Date: %s' % value except xlrd.xldate.XLDateTooLarge: value = 'Too Large Date: %s' % value except xlrd.xldate.XLDateBadDatemode: value = 'Invalid Date Mode: (%s, %s)' % (value, book.datemode) elif typ == 4: value = str(value) elif typ == 5: value = xlrd.error_text_from_code[value] row.append(value) # Strip trailing empty cells while len(row) and row[-1] == '': row = row[:-1] return row def _xlsx_getrow(book, sheet, cellrow, r): '''Translate the given XLSX row into text.''' row = [] for cell in cellrow: typ, value, fmt = cell.data_type, cell.value, cell.number_format # Types: # 's' = unicode text # 'd' = datetime.datetime or datetime.time # 'n' = number (int or float) # 'e' = error if value is None: value = '' elif typ == 's': # String pass # value is already a string elif typ == 'd': # Date and/or time (ignore the 3 added microseconds) # No date, just time if isinstance(value, datetime.time): value = '%02d:%02d%s' % (value.hour, value.minute, (':%02d' % value.second) if 'S' in fmt else '') # Date and time, or just date elif isinstance(value, datetime.datetime): if value.hour or value.minute or value.second: value = "%04d-%02d-%02dT%02d:%02d%s" % (value.year, value.month, value.day, value.hour, value.minute, (':%02d' % value.second) if 'S' in fmt else '') else: # Just a date returned as a datetime.datetime value = "%04d-%02d-%02d" % (value.year, value.month, value.day) elif typ == 'n': # Number match = re.compile('\.(0+)').search(fmt) if match is not None: digits = len(match.group(1)) value = decimal.Decimal(str(value)).quantize(decimal.Decimal((0, (1,), -digits)), rounding=decimal.ROUND_HALF_UP) if value == int(value): val = int(value) value = str(value) elif typ == 'e' : # Error pass # Value is something like '#DIV/0!' or '#VALUE!' or '#REF!' else: #import sys #print('xlsx: Unexpected cell typ=%r val=%r fmt=%r' % (typ, value, fmt), file=sys.stderr) if not isinstance(value, basestring): value = str(value) row.append(value) # Strip trailing empty cells while len(row) and row[-1] == '': row = row[:-1] return row def convert_charset(fname): '''Convert the file with the given name from UTF32, UTF16 or UTF8-with-BOM to Latin-1.''' f = open(fname, 'rb') bytes = f.read() f.close() converted = None if bytes[0:4] == '\x00\x00\xfe\xff': # UTF-32, big-endian converted = bytes.decode('utf-32').encode('iso-8859-1', 'replace') elif bytes[0:4] == '\xff\xfe\x00\x00': # UTF-32, little-endian converted = bytes.decode('utf-32').encode('iso-8859-1', 'replace') elif bytes[0:2] == '\xfe\xff': # UTF-16, big-endian converted = bytes.decode('utf-16').encode('iso-8859-1', 'replace') elif bytes[0:2] == '\xff\xfe': # UTF-16, little-endian converted = bytes.decode('utf-16').encode('iso-8859-1', 'replace') elif bytes[0:3] == '\xef\xbb\xbf': # utf8 converted = bytes.decode('utf-16').encode('iso-8859-1', 'replace') if converted is not None: f = open(fname, 'wb') f.write(converted) f.close() #ifdef TEST if __name__ == '__main__': def _csv_quote(value): '''Return value quoted as needed for inclusion in a CSV file.''' if value is None: return '' if not isinstance(value, basestring): value = str(value) return '"' + value.replace('"', '""') + '"' if any([_ in value for _ in '",\r\n\v\f']) else value def main(): '''Test reader() by reading the csv or xls or xlsx file whose name is given on the command line and printing out its contents in csv format. It is up to the user to verify that it really worked.''' import sys if len(sys.argv) not in [2, 3]: print('usage: %s filename [sheetnumber (1-based)]' % sys.argv[0]) sys.exit(1) i = int(sys.argv[2]) - 1 if len(sys.argv) == 3 else -1 for row in reader(sys.argv[1], i): print('%s' % ','.join([_csv_quote(unicode(cell.strip()).encode('utf8')) for cell in row])) main() #endif TEST # vi:set ts=4 sw=4: