579 lines
16 KiB
C++
579 lines
16 KiB
C++
|
/**
 * csv_parser Header File
 *
 * This object is used to parse text documents that are delimited by some
 * type of character. Some of the common delimiters are spaces, tabs, commas and semi-colons.
 *
 * This is a list of common characters encountered by this program.
 *
 * This list was prepared from the data at http://www.asciitable.com
 *
 * @li DEC is how it would be represented in decimal form (base 10)
 * @li HEX is how it would be represented in hexadecimal format (base 16)
 *
 * @li DEC  HEX   Character Name
 * @li 0    0x00  null
 * @li 9    0x09  horizontal tab
 * @li 10   0x0A  line feed, new line
 * @li 13   0x0D  carriage return
 * @li 27   0x1B  escape
 * @li 32   0x20  space
 * @li 34   0x22  double quote
 * @li 39   0x27  single quote
 * @li 44   0x2C  comma
 * @li 92   0x5C  backslash
 *
 * @author Israel Ekpo <israel.ekpo@israelekpo.com>
 */
|
||
|
|
||
|
#ifndef CSV_PARSER_HPP_INCLUDED
|
||
|
|
||
|
#define CSV_PARSER_HPP_INCLUDED
|
||
|
|
||
|
/** Major component of the library version. */
#define LIBCSV_PARSER_MAJOR_VERSION     1

/** Minor component of the library version. */
#define LIBCSV_PARSER_MINOR_VERSION     0

/** Patch component of the library version. */
#define LIBCSV_PARSER_PATCH_VERSION     0

/**
 * Library version as a single integer, usable in preprocessor comparisons.
 *
 * NOTE(review): 10000 for version 1.0.0 looks like
 * major * 10000 + minor * 100 + patch, but the encoding is not stated
 * anywhere in this header - confirm before bumping.
 */
#define LIBCSV_PARSER_VERSION_NUMBER    10000
|
||
|
|
||
|
/* C++ header files */
|
||
|
#include <string>
|
||
|
#include <vector>
|
||
|
|
||
|
|
||
|
/* C header files */
|
||
|
#include <cstdio>
|
||
|
#include <cstring>
|
||
|
#include <cstdlib>
|
||
|
|
||
|
/*
 * Kept as-is: translation units that include this header may rely on the
 * standard names being visible without qualification.
 */
using namespace std;

/**
 * @typedef csv_row
 *
 * A single parsed record: one string per field, in file order.
 *
 * Alias for std::vector<std::string>.
 */
typedef std::vector<std::string> csv_row;

/**
 * @typedef csv_row_ptr
 *
 * Pointer to a csv_row object.
 *
 * Expands to std::vector<std::string> *.
 */
typedef csv_row * csv_row_ptr;
|
||
|
|
||
|
/**
 * @enum enclosure_type_t
 *
 * Selects how field enclosure characters are treated while a CSV file is parsed.
 *
 * @li ENCLOSURE_NONE (1) the file does not use enclosure characters around fields
 * @li ENCLOSURE_REQUIRED (2) every field in the file must be wrapped in the enclosure character
 * @li ENCLOSURE_OPTIONAL (3) fields may or may not be wrapped in the enclosure character
 *
 * ENCLOSURE_TYPE_BEGIN and ENCLOSURE_TYPE_END only bracket the valid range
 * and must never be used as a parsing mode.
 */
enum enclosure_type_t
{
    ENCLOSURE_TYPE_BEGIN = 0,   /* lower sentinel, not a valid mode */
    ENCLOSURE_NONE       = 1,
    ENCLOSURE_REQUIRED   = 2,
    ENCLOSURE_OPTIONAL   = 3,
    ENCLOSURE_TYPE_END   = 4    /* upper sentinel, not a valid mode */
};
|
||
|
|
||
|
/**
 * @def CSV_PARSER_FREE_BUFFER_PTR(ptr)
 *
 * Releases a buffer with free() and resets the pointer to NULL.
 *
 * Nothing happens when the pointer is already NULL, so the macro may be
 * applied to the same variable more than once.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to exactly
 * one statement: it must be followed by a semicolon and is safe to use as the
 * unbraced body of an if/else.
 *
 * @note ptr is evaluated more than once and is assigned to; pass a plain
 *       pointer variable, not an expression with side effects.
 */
#define CSV_PARSER_FREE_BUFFER_PTR(ptr)     \
do                                          \
{                                           \
    if ((ptr) != NULL)                      \
    {                                       \
        free(ptr);                          \
                                            \
        (ptr) = NULL;                       \
    }                                       \
} while (0)
|
||
|
|
||
|
/**
 * @def CSV_PARSER_FREE_FILE_PTR(fptr)
 *
 * Closes an open FILE handle with fclose() and resets the pointer to NULL.
 *
 * Nothing happens when the pointer is already NULL, so the macro may be
 * applied to the same variable more than once.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to exactly
 * one statement: it must be followed by a semicolon and is safe to use as the
 * unbraced body of an if/else.
 *
 * @note fptr is evaluated more than once and is assigned to; pass a plain
 *       FILE pointer variable, not an expression with side effects.
 *       The return value of fclose() is ignored.
 */
#define CSV_PARSER_FREE_FILE_PTR(fptr)      \
do                                          \
{                                           \
    if ((fptr) != NULL)                     \
    {                                       \
        fclose(fptr);                       \
                                            \
        (fptr) = NULL;                      \
    }                                       \
} while (0)
|
||
|
|
||
|
/**
|
||
|
* @class csv_parser
|
||
|
*
|
||
|
* The csv_parser object
|
||
|
*
|
||
|
* Used to parse text files to extract records and fields.
|
||
|
*
|
||
|
* We are making the following assumptions :
|
||
|
*
|
||
|
* @li The record terminator is only one character in length.
|
||
|
* @li The field terminator is only one character in length.
|
||
|
* @li The fields are enclosed by single characters, if any.
|
||
|
*
|
||
|
* @li The parser can handle documents where fields are always enclosed, not enclosed at all or optionally enclosed.
|
||
|
* @li When fields are strictly all enclosed, there is an assumption that any enclosure characters within the field are escaped by placing a backslash in front of the enclosure character.
|
||
|
*
|
||
|
* The CSV files can be parsed in 3 modes.
|
||
|
* @li (a) No enclosures
|
||
|
* @li (b) Fields always enclosed.
|
||
|
* @li (c) Fields optionally enclosed.
|
||
|
*
|
||
|
* For option (c) when the enclosure character is optional, if an enclosure character is spotted at either the beginning
|
||
|
* or the end of the string, it is assumed that the field is enclosed.
|
||
|
*
|
||
|
* The csv_parser::init() method can accept a character array as the path to the CSV file.
|
||
|
* Since it is overloaded, it can also accept a FILE pointer to a stream that is already open for reading.
|
||
|
*
|
||
|
* The set_enclosed_char() method accepts the field enclosure character as the first parameter and the enclosure mode as the second parameter which
|
||
|
* controls how the text file is going to be parsed.
|
||
|
*
|
||
|
* @see csv_parser::set_enclosed_char()
|
||
|
* @see enclosure_type_t
|
||
|
*
|
||
|
* @todo Add ability to parse files where fields/columns are terminated by strings instead of just one char.
|
||
|
* @todo Add ability to set strings where lines start by. Currently lines do not have any starting char or string.
|
||
|
* @todo Add ability to set strings where line end by. Currently lines can only end with a single char.
|
||
|
* @todo Add ability to accept other escape characters besides the backslash character 0x5C.
|
||
|
* @todo More support for improperly formatted CSV data files.
|
||
|
*
|
||
|
* @author Israel Ekpo <israel.ekpo@israelekpo.com>
|
||
|
*/
|
||
|
class csv_parser
|
||
|
{
|
||
|
|
||
|
public :
|
||
|
|
||
|
/**
|
||
|
* Class constructor
|
||
|
*
|
||
|
* This is the default constructor.
|
||
|
*
|
||
|
* All the internal attributes are initialized here
|
||
|
*
|
||
|
* @li The enclosure character is initialized to NULL 0x00.
|
||
|
* @li The escape character is initialized to the backslash character 0x5C.
|
||
|
* @li The field delimiter character is initialized to a comma 0x2C.
|
||
|
* @li The record delimiter character is initialized to a new line character 0x0A.
|
||
|
*
|
||
|
* @li The lengths of all the above-mentioned fields are initialized to 0,1,1 and 1 respectively.
|
||
|
* @li The number of records to ignore is set to zero.
|
||
|
* @li The more_rows internal attribute is set to false.
|
||
|
* @li The pointer to the CSV input file is initialized to NULL
|
||
|
* @li The pointer to the buffer for the file name is also initialized to NULL
|
||
|
*/
|
||
|
csv_parser() : enclosed_char(0x00), escaped_char(0x5C),
|
||
|
field_term_char(0x2C), line_term_char(0x0A),
|
||
|
enclosed_length(0U), escaped_length(1U),
|
||
|
field_term_length(1U), line_term_length(1U),
|
||
|
ignore_num_lines(0U), record_count(0U),
|
||
|
input_fp(NULL), input_filename(NULL),
|
||
|
enclosure_type(ENCLOSURE_NONE),
|
||
|
more_rows(false)
|
||
|
{ }
|
||
|
|
||
|
/**
|
||
|
* Class destructor
|
||
|
*
|
||
|
* In the class destructor the file pointer to the input CSV file is closed and
|
||
|
* the buffer to the input file name is also deallocated.
|
||
|
*
|
||
|
* @see csv_parser::input_fp
|
||
|
* @see csv_parser::input_filename
|
||
|
*/
|
||
|
~csv_parser()
|
||
|
{
|
||
|
CSV_PARSER_FREE_FILE_PTR(input_fp);
|
||
|
|
||
|
CSV_PARSER_FREE_BUFFER_PTR(input_filename);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Initializes the current object
|
||
|
*
|
||
|
* This init method accepts a pointer to the CSV file that has been opened for reading
|
||
|
*
|
||
|
* It also resets the file pointer to the beginning of the stream
|
||
|
*
|
||
|
* @overload bool init(FILE * input_file_pointer)
|
||
|
* @param[in] input_file_pointer
|
||
|
* @return bool Returns true on success and false on error.
|
||
|
*/
|
||
|
bool init(FILE * input_file_pointer);
|
||
|
|
||
|
/**
|
||
|
* Initializes the current object
|
||
|
*
|
||
|
* @li This init method accepts a character array as the path to the csv file.
|
||
|
* @li It sets the value of the csv_parser::input_filename property.
|
||
|
* @li Then it creates a pointer to the csv_parser::input_fp property.
|
||
|
*
|
||
|
* @overload bool init(const char * input_filename)
|
||
|
* @param[in] input_filename
|
||
|
* @return bool Returns true on success and false on error.
|
||
|
*/
|
||
|
bool init(const char * input_filename);
|
||
|
|
||
|
/**
|
||
|
* Defines the Field Enclosure character used in the Text File
|
||
|
*
|
||
|
* Setting this to NULL means that the enclosure character is optional.
|
||
|
*
|
||
|
* If the enclosure is optional, there could be fields that are enclosed, and fields that are not enclosed within the same line/record.
|
||
|
*
|
||
|
* @param[in] fields_enclosed_by The character used to enclose the fields.
|
||
|
* @param[in] enclosure_mode How the CSV file should be parsed.
|
||
|
* @return void
|
||
|
*/
|
||
|
void set_enclosed_char(char fields_enclosed_by, enclosure_type_t enclosure_mode);
|
||
|
|
||
|
/**
|
||
|
* Defines the Field Delimiter character used in the text file
|
||
|
*
|
||
|
* @param[in] fields_terminated_by
|
||
|
* @return void
|
||
|
*/
|
||
|
void set_field_term_char(char fields_terminated_by);
|
||
|
|
||
|
/**
|
||
|
* Defines the Record Terminator character used in the text file
|
||
|
*
|
||
|
* @param[in] lines_terminated_by
|
||
|
* @return void
|
||
|
*/
|
||
|
void set_line_term_char(char lines_terminated_by);
|
||
|
|
||
|
/**
|
||
|
* Returns whether there is still more data
|
||
|
*
|
||
|
* This method returns a boolean value indicating whether or not there are
|
||
|
* still more records to be extracted in the current file being parsed.
|
||
|
*
|
||
|
* Call this method to see if there are more rows to retrieve before invoking csv_parser::get_row()
|
||
|
*
|
||
|
* @see csv_parser::get_row()
|
||
|
* @see csv_parser::more_rows
|
||
|
*
|
||
|
* @return bool Returns true if there are still more rows and false if there is not.
|
||
|
*/
|
||
|
bool has_more_rows(void)
|
||
|
{
|
||
|
return more_rows;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Defines the number of records to discard
|
||
|
*
|
||
|
* The number of records specified will be discarded during the parsing process.
|
||
|
*
|
||
|
* @see csv_parser::_skip_lines()
|
||
|
* @see csv_parser::get_row()
|
||
|
* @see csv_parser::has_more_rows()
|
||
|
*
|
||
|
* @param[in] lines_to_skip How many records should be skipped
|
||
|
* @return void
|
||
|
*/
|
||
|
void set_skip_lines(unsigned int lines_to_skip)
|
||
|
{
|
||
|
ignore_num_lines = lines_to_skip;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Return the current row from the CSV file
|
||
|
*
|
||
|
* The row is returned as a vector of string objects.
|
||
|
*
|
||
|
* This method should be called only if csv_parser::has_more_rows() is true
|
||
|
*
|
||
|
* @see csv_parser::has_more_rows()
|
||
|
* @see csv_parser::get_record_count()
|
||
|
* @see csv_parser::reset_record_count()
|
||
|
* @see csv_parser::more_rows
|
||
|
*
|
||
|
* @return csv_row A vector type containing an array of strings
|
||
|
*/
|
||
|
csv_row get_row(void);
|
||
|
|
||
|
/**
|
||
|
* Returns the number of times the csv_parser::get_row() method has been invoked
|
||
|
*
|
||
|
* @see csv_parser::reset_record_count()
|
||
|
* @return unsigned int The number of times the csv_parser::get_row() method has been invoked.
|
||
|
*/
|
||
|
unsigned int get_record_count(void)
|
||
|
{
|
||
|
return record_count;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Resets the record_count internal attribute to zero
|
||
|
*
|
||
|
* This may be used if the object is reused multiple times.
|
||
|
*
|
||
|
* @see csv_parser::record_count
|
||
|
* @see csv_parser::get_record_count()
|
||
|
* @return void
|
||
|
*/
|
||
|
void reset_record_count(void)
|
||
|
{
|
||
|
record_count = 0U;
|
||
|
}
|
||
|
|
||
|
private :
|
||
|
|
||
|
/**
|
||
|
* Ignores N records in the CSV file
|
||
|
*
|
||
|
* Where N is the value of the csv_parser::ignore_num_lines internal property.
|
||
|
*
|
||
|
* The number of lines skipped can be defined by csv_parser::set_skip_lines()
|
||
|
*
|
||
|
* @see csv_parser::set_skip_lines()
|
||
|
*
|
||
|
* @return void
|
||
|
*/
|
||
|
void _skip_lines(void);
|
||
|
|
||
|
/**
|
||
|
* Reads a Single Line
|
||
|
*
|
||
|
* Reads a single record into the buffer passed by reference to the method
|
||
|
*
|
||
|
* @param[in,out] buffer A pointer to a character array for the current line.
|
||
|
* @param[out] buffer_len A pointer to an integer storing the length of the buffer.
|
||
|
* @return void
|
||
|
*/
|
||
|
void _read_single_line(char ** buffer, unsigned int * buffer_len);
|
||
|
|
||
|
/**
|
||
|
* Extracts the fields without enclosures
|
||
|
*
|
||
|
* This is used when the enclosure character is not set
|
||
|
* @param[out] row The vector of strings
|
||
|
* @param[in] line The character array buffer containing the current record/line
|
||
|
* @param[in] line_length The length of the buffer
|
||
|
*/
|
||
|
void _get_fields_without_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);
|
||
|
|
||
|
/**
|
||
|
* Extracts the fields with enclosures
|
||
|
*
|
||
|
* This is used when the enclosure character is set.
|
||
|
*
|
||
|
* @param[out] row The vector of strings
|
||
|
* @param[in] line The character array buffer containing the current record/line
|
||
|
* @param[in] line_length The length of the buffer
|
||
|
*/
|
||
|
void _get_fields_with_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);
|
||
|
|
||
|
/**
|
||
|
* Extracts the fields when enclosure is optional
|
||
|
*
|
||
|
* This is used when the enclosure character is optional
|
||
|
*
|
||
|
* Hence, there could be fields that use it, and fields that don't.
|
||
|
*
|
||
|
* @param[out] row The vector of strings
|
||
|
* @param[in] line The character array buffer containing the current record/line
|
||
|
* @param[in] line_length The length of the buffer
|
||
|
*/
|
||
|
void _get_fields_with_optional_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);
|
||
|
|
||
|
protected :
|
||
|
|
||
|
/**
|
||
|
* The enclosure character
|
||
|
*
|
||
|
* If present or used for a field it is assumed that both ends of the fields are wrapped.
|
||
|
*
|
||
|
* This is that single character used in the document to wrap the fields.
|
||
|
*
|
||
|
* @see csv_parser::_get_fields_without_enclosure()
|
||
|
* @see csv_parser::_get_fields_with_enclosure()
|
||
|
* @see csv_parser::_get_fields_with_optional_enclosure()
|
||
|
*
|
||
|
* @var enclosed_char
|
||
|
*/
|
||
|
char enclosed_char;
|
||
|
|
||
|
/**
|
||
|
* The escape character
|
||
|
*
|
||
|
* For now the only valid escape character allowed is the backslash character 0x5C
|
||
|
*
|
||
|
* This is only important when the enclosure character is required or optional.
|
||
|
*
|
||
|
* This is the backslash character used to escape enclosure characters found within the fields.
|
||
|
*
|
||
|
* @see csv_parser::_get_fields_with_enclosure()
|
||
|
* @see csv_parser::_get_fields_with_optional_enclosure()
|
||
|
* @todo Update the code to accept other escape characters besides the backslash
|
||
|
*
|
||
|
* @var escaped_char
|
||
|
*/
|
||
|
char escaped_char;
|
||
|
|
||
|
/**
|
||
|
* The field terminator
|
||
|
*
|
||
|
* This is the single character used to mark the end of a column in the text file.
|
||
|
*
|
||
|
* Common characters used include the comma, tab, and semi-colons.
|
||
|
*
|
||
|
* This is the single character used to separate fields within a record.
|
||
|
*
|
||
|
* @var field_term_char
|
||
|
*/
|
||
|
char field_term_char;
|
||
|
|
||
|
/**
|
||
|
* The record terminator
|
||
|
*
|
||
|
* This is the single character used to mark the end of a record in the text file.
|
||
|
*
|
||
|
* The most popular one is the new line character however it is possible to use others as well.
|
||
|
*
|
||
|
* This is the single character used to mark the end of a record
|
||
|
*
|
||
|
* @see csv_parser::get_row()
|
||
|
*
|
||
|
* @var line_term_char
|
||
|
*/
|
||
|
char line_term_char;
|
||
|
|
||
|
/**
|
||
|
* Enclosure length
|
||
|
*
|
||
|
* This is the length of the enclosure character
|
||
|
*
|
||
|
* @see csv_parser::csv_parser()
|
||
|
* @see csv_parser::set_enclosed_char()
|
||
|
*
|
||
|
* @var enclosed_length
|
||
|
*/
|
||
|
unsigned int enclosed_length;
|
||
|
|
||
|
/**
|
||
|
* The length of the escape character
|
||
|
*
|
||
|
* Right now this is really not being used.
|
||
|
*
|
||
|
* It may be used in future versions of the object.
|
||
|
*
|
||
|
* @todo Update the code to accept other escape characters besides the backslash
|
||
|
*
|
||
|
* @var escaped_length
|
||
|
*/
|
||
|
unsigned int escaped_length;
|
||
|
|
||
|
/**
|
||
|
* Length of the field terminator
|
||
|
*
|
||
|
* For now this is not being used. It will be used in future versions of the object.
|
||
|
*
|
||
|
* @var field_term_length
|
||
|
*/
|
||
|
unsigned int field_term_length;
|
||
|
|
||
|
/**
|
||
|
* Length of the record terminator
|
||
|
*
|
||
|
* For now this is not being used. It will be used in future versions of the object.
|
||
|
*
|
||
|
* @var line_term_length
|
||
|
*/
|
||
|
unsigned int line_term_length;
|
||
|
|
||
|
/**
|
||
|
* Number of records to discard
|
||
|
*
|
||
|
* This variable controls how many records in the file are skipped before parsing begins.
|
||
|
*
|
||
|
* @see csv_parser::_skip_lines()
|
||
|
* @see csv_parser::set_skip_lines()
|
||
|
*
|
||
|
* @var ignore_num_lines
|
||
|
*/
|
||
|
unsigned int ignore_num_lines;
|
||
|
|
||
|
/**
|
||
|
* Number of times the get_row() method has been called
|
||
|
*
|
||
|
* @see csv_parser::get_row()
|
||
|
* @var record_count
|
||
|
*/
|
||
|
unsigned int record_count;
|
||
|
|
||
|
/**
|
||
|
* The CSV File Pointer
|
||
|
*
|
||
|
* This is the pointer to the CSV file
|
||
|
*
|
||
|
* @var input_fp
|
||
|
*/
|
||
|
FILE * input_fp;
|
||
|
|
||
|
/**
|
||
|
* Buffer to input file name
|
||
|
*
|
||
|
* This buffer is used to store the name of the file that is being parsed
|
||
|
*
|
||
|
* @var input_filename
|
||
|
*/
|
||
|
char * input_filename;
|
||
|
|
||
|
/**
|
||
|
* Mode in which the CSV file will be parsed
|
||
|
*
|
||
|
* The various values are explained below
|
||
|
*
|
||
|
* @li ENCLOSURE_NONE (1) means the CSV file does not use any enclosure characters for the fields
|
||
|
* @li ENCLOSURE_REQUIRED (2) means the CSV file requires enclosure characters for all the fields
|
||
|
* @li ENCLOSURE_OPTIONAL (3) means the use of enclosure characters for the fields is optional
|
||
|
*
|
||
|
* @see csv_parser::get_row()
|
||
|
* @see csv_parser::_read_single_line()
|
||
|
* @see csv_parser::_get_fields_without_enclosure()
|
||
|
* @see csv_parser::_get_fields_with_enclosure()
|
||
|
* @see csv_parser::_get_fields_with_optional_enclosure()
|
||
|
*
|
||
|
* @var enclosure_type
|
||
|
*/
|
||
|
enclosure_type_t enclosure_type;
|
||
|
|
||
|
/**
|
||
|
* There are still more records to parse
|
||
|
*
|
||
|
* This boolean property is an internal indicator of whether there are still records in the
|
||
|
* file to be parsed.
|
||
|
*
|
||
|
* @see csv_parser::has_more_rows()
|
||
|
* @var more_rows
|
||
|
*/
|
||
|
bool more_rows;
|
||
|
|
||
|
}; /* class csv_parser */
|
||
|
|
||
|
#endif /* CSV_PARSER_HPP_INCLUDED */
|