/* ==================================================================== Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==================================================================== */ namespace HH.WMS.Utils.NPOI.HSSF.Extractor { using System; using System.Text; using System.IO; using HH.WMS.Utils.NPOI.HSSF.UserModel; using HH.WMS.Utils.NPOI.HSSF.Record; using HH.WMS.Utils.NPOI.POIFS.FileSystem; using HH.WMS.Utils.NPOI; using HH.WMS.Utils.NPOI.SS.Formula.Eval; using HH.WMS.Utils.NPOI.SS.UserModel; /// /// A text extractor for Excel files. /// Returns the textual content of the file, suitable for /// indexing by something like Lucene, but not really /// intended for display to the user. /// public class ExcelExtractor : POIOLE2TextExtractor { private HSSFWorkbook wb; private HSSFDataFormatter _formatter; private bool includeSheetNames = true; private bool formulasNotResults = false; private bool includeCellComments = false; private bool includeBlankCells = false; private bool includeHeaderFooter = true; /// /// Initializes a new instance of the class. /// /// The wb. public ExcelExtractor(HSSFWorkbook wb) : base(wb) { this.wb = wb; _formatter = new HSSFDataFormatter(); } /// /// Initializes a new instance of the class. /// /// The fs. public ExcelExtractor(POIFSFileSystem fs) : this(new HSSFWorkbook(fs)) { } /// /// Should header and footer be included? Default is true /// public bool IncludeHeaderFooter { get { return this.includeHeaderFooter; } set { this.includeHeaderFooter = value; } } /// /// Should sheet names be included? Default is true /// /// if set to true [include sheet names]. public bool IncludeSheetNames { get { return this.includeSheetNames; } set { this.includeSheetNames = value; } } /// /// Should we return the formula itself, and not /// the result it produces? Default is false /// /// if set to true [formulas not results]. public bool FormulasNotResults { get { return this.formulasNotResults; } set { this.formulasNotResults = value; } } /// /// Should cell comments be included? Default is false /// /// if set to true [include cell comments]. public bool IncludeCellComments { get { return this.includeCellComments; } set { this.includeCellComments = value; } } /// /// Should blank cells be output? Default is to only /// output cells that are present in the file and are /// non-blank. /// /// if set to true [include blank cells]. public bool IncludeBlankCells { get { return this.includeBlankCells; } set { this.includeBlankCells = value; } } /// /// Retreives the text contents of the file /// /// All the text from the document. public override String Text { get { StringBuilder text = new StringBuilder(); // We don't care about the differnce between // null (missing) and blank cells wb.MissingCellPolicy = MissingCellPolicy.RETURN_BLANK_AS_NULL; // Process each sheet in turn for (int i = 0; i < wb.NumberOfSheets; i++) { HSSFSheet sheet = (HSSFSheet)wb.GetSheetAt(i); if (sheet == null) { continue; } if (includeSheetNames) { String name = wb.GetSheetName(i); if (name != null) { text.Append(name); text.Append("\n"); } } // Header text, if there is any if (sheet.Header != null && includeHeaderFooter) { text.Append( ExtractHeaderFooter(sheet.Header) ); } int firstRow = sheet.FirstRowNum; int lastRow = sheet.LastRowNum; for (int j = firstRow; j <= lastRow; j++) { IRow row = sheet.GetRow(j); if (row == null) { continue; } // Check each cell in turn int firstCell = row.FirstCellNum; int lastCell = row.LastCellNum; if (includeBlankCells) { firstCell = 0; } for (int k = firstCell; k < lastCell; k++) { ICell cell = row.GetCell(k); bool outputContents = true; if (cell == null) { // Only output if requested outputContents = includeBlankCells; } else { switch (cell.CellType) { case CellType.STRING: text.Append(cell.RichStringCellValue.String); break; case CellType.NUMERIC: // Note - we don't apply any formatting! //text.Append(cell.NumericCellValue); text.Append(_formatter.FormatCellValue(cell)); break; case CellType.BOOLEAN: text.Append(cell.BooleanCellValue); break; case CellType.ERROR: text.Append(ErrorEval.GetText(cell.ErrorCellValue)); break; case CellType.FORMULA: if (formulasNotResults) { text.Append(cell.CellFormula); } else { switch (cell.CachedFormulaResultType) { case CellType.STRING: IRichTextString str = cell.RichStringCellValue; if (str != null && str.Length > 0) { text.Append(str.ToString()); } break; case CellType.NUMERIC: //text.Append(cell.NumericCellValue); HSSFCellStyle style = (HSSFCellStyle)cell.CellStyle; if (style == null) { text.Append(cell.NumericCellValue); } else { text.Append( _formatter.FormatRawCellContents( cell.NumericCellValue, style.DataFormat, style.GetDataFormatString() ) ); } break; case CellType.BOOLEAN: text.Append(cell.BooleanCellValue); break; case CellType.ERROR: text.Append(ErrorEval.GetText(cell.ErrorCellValue)); break; } } break; default: throw new Exception("Unexpected cell type (" + cell.CellType + ")"); } // Output the comment, if requested and exists HH.WMS.Utils.NPOI.SS.UserModel.IComment comment = cell.CellComment; if (includeCellComments && comment != null) { // Replace any newlines with spaces, otherwise it // breaks the output String commentText = comment.String.String.Replace('\n', ' '); text.Append(" Comment by " + comment.Author + ": " + commentText); } } // Output a tab if we're not on the last cell if (outputContents && k < (lastCell - 1)) { text.Append("\t"); } } // Finish off the row text.Append("\n"); } // Finally Feader text, if there is any if (sheet.Footer != null && includeHeaderFooter) { text.Append( ExtractHeaderFooter(sheet.Footer) ); } } return text.ToString(); } } /// /// Extracts the header footer. /// /// The header or footer /// private String ExtractHeaderFooter(HH.WMS.Utils.NPOI.SS.UserModel.IHeaderFooter hf) { StringBuilder text = new StringBuilder(); if (hf.Left != null) { text.Append(hf.Left); } if (hf.Center != null) { if (text.Length > 0) text.Append("\t"); text.Append(hf.Center); } if (hf.Right != null) { if (text.Length > 0) text.Append("\t"); text.Append(hf.Right); } if (text.Length > 0) text.Append("\n"); return text.ToString(); } } }