diff --git a/Sources/DataAccess/DataTableBuilder.cs b/Sources/DataAccess/DataTableBuilder.cs index 0601d80..1cf4aed 100644 --- a/Sources/DataAccess/DataTableBuilder.cs +++ b/Sources/DataAccess/DataTableBuilder.cs @@ -115,11 +115,11 @@ public static MutableDataTable GetMutableCopy(this DataTableBuilder builder, Dat /// ignored /// filename of table to load. Schema is inferred from header row. /// a in-memory table containing the topN rows from the supplied file. - public static MutableDataTable ReadSampleTopN(this DataTableBuilder builder, string filename) + public static MutableDataTable ReadSampleTopN(this DataTableBuilder builder, string filename, char columnSeparator = default (char)) { - return ReadSampleTopN(builder, filename, 100); + return ReadSampleTopN(builder, filename, columnSeparator, 100); } - + /// /// Return an in-memory table that contains the topN rows from the table in the filename. /// @@ -127,7 +127,7 @@ public static MutableDataTable ReadSampleTopN(this DataTableBuilder builder, str /// filename of table to load. Schema is inferred from header row. /// reads the topN rows from the table. /// a in-memory table containing the topN rows from the supplied file. - public static MutableDataTable ReadSampleTopN(this DataTableBuilder builder, string filename, int topN = 100) + public static MutableDataTable ReadSampleTopN(this DataTableBuilder builder, string filename, char columnSeparator = default(char), int topN = 100) { Debug.Assert(builder != null); if (filename == null) @@ -135,7 +135,7 @@ public static MutableDataTable ReadSampleTopN(this DataTableBuilder builder, str throw new ArgumentNullException("filename"); } - DataTable source = new FileStreamingDataTable(filename); + DataTable source = new FileStreamingDataTable(filename, columnSeparator); MutableDataTable dt = Analyze.SampleTopN(source, topN); return dt; } @@ -147,11 +147,11 @@ public static MutableDataTable ReadSampleTopN(this DataTableBuilder builder, str /// /// filename of CSV to read /// a streaming data table for the given filename - public static DataTable ReadLazy(this DataTableBuilder builder, string filename) + public static DataTable ReadLazy(this DataTableBuilder builder, string filename, char columnSeparator = default(char)) { Debug.Assert(builder != null); - return new FileStreamingDataTable(filename) { Name = filename }; + return new FileStreamingDataTable(filename, columnSeparator) { Name = filename }; } /// @@ -161,14 +161,13 @@ public static DataTable ReadLazy(this DataTableBuilder builder, string filename) /// /// input stream. Must be seekable and readable /// a streaming data table for the given filename - public static DataTable ReadLazy(this DataTableBuilder builder, Stream inputStream) + public static DataTable ReadLazy(this DataTableBuilder builder, Stream inputStream, char columnSeparator = default(char)) { Debug.Assert(builder != null); - return new StreamingDataTable(inputStream); + return new StreamingDataTable(inputStream, columnSeparator); } - /// /// Create an in-memory table with 2 columns (key and value), where each row is a KeyValuePair from the dictionary. /// diff --git a/Sources/DataAccess/Readers.cs b/Sources/DataAccess/Readers.cs index dd3879e..dc396ef 100644 --- a/Sources/DataAccess/Readers.cs +++ b/Sources/DataAccess/Readers.cs @@ -1,8 +1,10 @@ using System; using System.Collections.Generic; using System.IO; +using System.Linq; using System.Text; + namespace DataAccess { @@ -225,24 +227,15 @@ public static MutableDataTable Read(TextReader stream, char delimiter = '\0') public static char GuessSeparateFromHeaderRow(string header) { - if (header.Contains("\t")) - { - return '\t'; - } - - if (header.Contains(",")) - { - return ','; - } - - if (header.Contains(";")) - { - return ';'; - } - - // Fallback is always comma. This implies a single column. - return ','; - + var validSeparators = new[] { '\t', ',', ';' }; + + var firstSeparator = + (from x in validSeparators.Select(c => new { separator = c, index = header.IndexOf(c) }) + where x.index >= 0 + orderby x.index + select x.separator).ToList(); + + return firstSeparator.Any() ? firstSeparator.FirstOrDefault() : ','; } // Read in a Ascii file that uses the given separate characters. diff --git a/Sources/DataAccess/StreamingDataTable.cs b/Sources/DataAccess/StreamingDataTable.cs index 875c579..fcfb13a 100644 --- a/Sources/DataAccess/StreamingDataTable.cs +++ b/Sources/DataAccess/StreamingDataTable.cs @@ -15,7 +15,8 @@ internal class StreamingDataTable : TextReaderDataTable { readonly Stream _input; - public StreamingDataTable(Stream input) + public StreamingDataTable(Stream input, char columnSeparator) + : base(columnSeparator) { // We could optimize to avoid requiring CanSeek if we failed on attemps // to read the the rows multiple times. @@ -30,7 +31,7 @@ protected override TextReader OpenText() { _input.Position = 0; - + return new StreamReader(_input); } protected override void CloseText(TextReader reader) @@ -43,8 +44,9 @@ protected override void CloseText(TextReader reader) internal class FileStreamingDataTable : TextReaderDataTable { private readonly string _filename; - - public FileStreamingDataTable(string filename) + + public FileStreamingDataTable(string filename, char columnSeparator) + : base(columnSeparator) { _filename = filename; } @@ -65,9 +67,15 @@ protected override void CloseText(TextReader reader) /// internal abstract class TextReaderDataTable : DataTable { + private readonly char columnSeparator; + private string[] _names; - + protected TextReaderDataTable(char columnSeparator) + { + this.columnSeparator = columnSeparator; + } + public override IEnumerable ColumnNames { get @@ -77,10 +85,10 @@ public override IEnumerable ColumnNames TextReader sr = null; try { - sr = this.OpenText(); + sr = this.OpenText(); // First get columns. string header = sr.ReadLine(); - char ch = Reader.GuessSeparateFromHeaderRow(header); + char ch = this.columnSeparator == default(char) ? Reader.GuessSeparateFromHeaderRow(header) : this.columnSeparator; _names = Reader.split(header, ch); } finally @@ -100,7 +108,7 @@ public override IEnumerable ColumnNames // called on reader from OpenText // Don't call dipose because that can close streams. protected abstract void CloseText(TextReader reader); - + public override IEnumerable Rows { get @@ -113,7 +121,7 @@ public override IEnumerable Rows sr = this.OpenText(); string header = sr.ReadLine(); // skip past header - char chSeparator = Reader.GuessSeparateFromHeaderRow(header); + char chSeparator = this.columnSeparator == default(char) ? Reader.GuessSeparateFromHeaderRow(header) : this.columnSeparator; int illegal = 0; string line; @@ -124,7 +132,7 @@ public override IEnumerable Rows { string[] parts = Reader.split(line, chSeparator); - + // $$$ Major hack for dealing with newlines in quotes strings. // The better fix here would be to switch to a streaming interface. if (parts.Length != columnCount)