/**
 * csv_parser Header File
 *
 * This object is used to parse text documents that are delimited by some
 * type of character. Some of the common ones use spaces, tabs, commas and semi-colons.
 *
 * This is a list of common characters encountered by this program
 *
 * This list was prepared from the data from http://www.asciitable.com
 *
 * @li DEC is how it would be represented in decimal form (base 10)
 * @li HEX is how it would be represented in hexadecimal format (base 16)
 *
 * @li DEC  HEX   Character Name
 * @li 0    0x00  null
 * @li 9    0x09  horizontal tab
 * @li 10   0x0A  line feed, new line
 * @li 13   0x0D  carriage return
 * @li 27   0x1B  escape
 * @li 32   0x20  space
 * @li 34   0x22  double quote
 * @li 39   0x27  single quote
 * @li 44   0x2C  comma
 * @li 92   0x5C  backslash
 *
 * @author Israel Ekpo
 */

#ifndef CSV_PARSER_HPP_INCLUDED
#define CSV_PARSER_HPP_INCLUDED

#define LIBCSV_PARSER_MAJOR_VERSION 1
#define LIBCSV_PARSER_MINOR_VERSION 0
#define LIBCSV_PARSER_PATCH_VERSION 0

/* Encodes the version above as (major * 10000) + (minor * 100) + patch. */
#define LIBCSV_PARSER_VERSION_NUMBER 10000

/* C++ header files */
#include <string>
#include <vector>

/* C header files */
#include <cstdio>
#include <cstring>
#include <cstdlib>

/*
 * NOTE(review): a using-directive at header scope leaks into every translation
 * unit that includes this file. It is kept only because existing callers may
 * rely on it; the declarations below are explicitly qualified and do not need it.
 */
using namespace std;

/**
 * @typedef csv_row
 *
 * Data structure used to represent a record.
 *
 * This is an alias for std::vector<std::string>
 */
typedef std::vector<std::string> csv_row;

/**
 * @typedef csv_row_ptr
 *
 * Pointer to a csv_row object
 *
 * Expands to std::vector<std::string> *
 */
typedef csv_row * csv_row_ptr;

/**
 * @typedef enclosure_type_t
 *
 * This enum type is used to set the mode in which the CSV file is parsed.
 *
 * @li ENCLOSURE_NONE     (1) means the CSV file does not use any enclosure characters for the fields
 * @li ENCLOSURE_REQUIRED (2) means the CSV file requires enclosure characters for all the fields
 * @li ENCLOSURE_OPTIONAL (3) means the use of enclosure characters for the fields is optional
 *
 * The ENCLOSURE_TYPE_BEGIN and ENCLOSURE_TYPE_END members of this enum definition are never to be used.
 * They only mark the lower and upper bounds of the valid range.
 */
typedef enum
{
    ENCLOSURE_TYPE_BEGIN = 0,
    ENCLOSURE_NONE       = 1,
    ENCLOSURE_REQUIRED   = 2,
    ENCLOSURE_OPTIONAL   = 3,
    ENCLOSURE_TYPE_END
} enclosure_type_t;

/**
 * @def CSV_PARSER_FREE_BUFFER_PTR(ptr)
 *
 * Used to deallocate buffer pointers
 *
 * It deallocates the pointer with free() only if it is not null, then resets it to NULL
 * so that a second invocation on the same variable is a no-op.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a single
 * statement and can be used safely inside an unbraced if/else. It must be followed
 * by a semicolon. The argument is evaluated more than once, so it must be a plain
 * modifiable pointer variable without side effects.
 */
#define CSV_PARSER_FREE_BUFFER_PTR(ptr) \
    do                                  \
    {                                   \
        if ((ptr) != NULL)              \
        {                               \
            free(ptr);                  \
            (ptr) = NULL;               \
        }                               \
    } while (0)

/**
 * @def CSV_PARSER_FREE_FILE_PTR(fptr)
 *
 * Used to close open file handles
 *
 * It closes the file with fclose() only if it is not null, then resets it to NULL
 * so that a second invocation on the same variable is a no-op.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a single
 * statement and can be used safely inside an unbraced if/else. It must be followed
 * by a semicolon. The argument is evaluated more than once, so it must be a plain
 * modifiable FILE pointer variable without side effects.
 */
#define CSV_PARSER_FREE_FILE_PTR(fptr)  \
    do                                  \
    {                                   \
        if ((fptr) != NULL)             \
        {                               \
            fclose(fptr);               \
            (fptr) = NULL;              \
        }                               \
    } while (0)

/**
 * @class csv_parser
 *
 * The csv_parser object
 *
 * Used to parse text files to extract records and fields.
 *
 * We are making the following assumptions :
 *
 * @li The record terminator is only one character in length.
 * @li The field terminator is only one character in length.
 * @li The fields are enclosed by single characters, if any.
 *
 * @li The parser can handle documents where fields are always enclosed, not enclosed at all or optionally enclosed.
 * @li When fields are strictly all enclosed, there is an assumption that any enclosure characters within the field
 *     are escaped by placing a backslash in front of the enclosure character.
 *
 * The CSV files can be parsed in 3 modes.
 * @li (a) No enclosures
 * @li (b) Fields always enclosed.
 * @li (c) Fields optionally enclosed.
 *
 * For option (c) when the enclosure character is optional, if an enclosure character is spotted at either the beginning
 * or the end of the string, it is assumed that the field is enclosed.
 *
 * The csv_parser::init() method can accept a character array as the path to the CSV file.
 * Since it is overloaded, it can also accept a FILE pointer to a stream that is already open for reading.
 *
 * The set_enclosed_char() method accepts the field enclosure character as the first parameter and the enclosure mode
 * as the second parameter which controls how the text file is going to be parsed.
 *
 * NOTE(review): the destructor releases input_fp and input_filename, but the class declares no copy
 * constructor or copy assignment operator. A copy of an initialized parser would therefore release the
 * same file handle and buffer a second time. Do not copy initialized instances.
 *
 * @see csv_parser::set_enclosed_char()
 * @see enclosure_type_t
 *
 * @todo Add ability to parse files where fields/columns are terminated by strings instead of just one char.
 * @todo Add ability to set strings where lines start by. Currently lines do not have any starting char or string.
 * @todo Add ability to set strings where line end by. Currently lines can only end with a single char.
 * @todo Add ability to accept other escape characters besides the backslash character 0x5C.
 * @todo More support for improperly formatted CSV data files.
 *
 * @author Israel Ekpo
 */
class csv_parser
{

public :

    /**
     * Class constructor
     *
     * This is the default constructor.
     *
     * All the internal attributes are initialized here
     *
     * @li The enclosure character is initialized to NULL 0x00.
     * @li The escape character is initialized to the backslash character 0x5C.
     * @li The field delimiter character is initialized to a comma 0x2C.
     * @li The record delimiter character is initialized to a new line character 0x0A.
     *
     * @li The lengths of all the above-mentioned fields are initialized to 0,1,1 and 1 respectively.
     * @li The number of records to ignore is set to zero.
     * @li The record count is set to zero.
     * @li The more_rows internal attribute is set to false.
     * @li The pointer to the CSV input file is initialized to NULL
     * @li The pointer to the buffer for the file name is also initialized to NULL
     * @li The enclosure mode is initialized to ENCLOSURE_NONE.
     */
    csv_parser()
        : enclosed_char(0x00),
          escaped_char(0x5C),
          field_term_char(0x2C),
          line_term_char(0x0A),
          enclosed_length(0U),
          escaped_length(1U),
          field_term_length(1U),
          line_term_length(1U),
          ignore_num_lines(0U),
          record_count(0U),
          input_fp(NULL),
          input_filename(NULL),
          enclosure_type(ENCLOSURE_NONE),
          more_rows(false)
    {
    }

    /**
     * Class destructor
     *
     * In the class destructor the file pointer to the input CSV file is closed and
     * the buffer to the input file name is also deallocated.
     *
     * Both operations are skipped when the corresponding pointer is NULL.
     *
     * NOTE(review): input_fp is closed unconditionally when it is not NULL. If init(FILE *)
     * stores the caller's stream in input_fp (not visible in this header - confirm in the
     * implementation), the caller must not close that stream again.
     *
     * @see csv_parser::input_fp
     * @see csv_parser::input_filename
     */
    ~csv_parser()
    {
        CSV_PARSER_FREE_FILE_PTR(input_fp);

        CSV_PARSER_FREE_BUFFER_PTR(input_filename);
    }

    /**
     * Initializes the current object
     *
     * This init method accepts a pointer to the CSV file that has been opened for reading
     *
     * It also resets the file pointer to the beginning of the stream
     *
     * @overload bool init(FILE * input_file_pointer)
     * @param[in] input_file_pointer
     * @return bool Returns true on success and false on error.
     */
    bool init(FILE * input_file_pointer);

    /**
     * Initializes the current object
     *
     * @li This init method accepts a character array as the path to the csv file.
     * @li It sets the value of the csv_parser::input_filename property.
     * @li Then it creates a pointer to the csv_parser::input_fp property.
     *
     * @overload bool init(const char * input_filename)
     * @param[in] input_filename
     * @return bool Returns true on success and false on error.
     */
    bool init(const char * input_filename);

    /**
     * Defines the Field Enclosure character used in the Text File
     *
     * Setting this to NULL means that the enclosure character is optional.
     *
     * If the enclosure is optional, there could be fields that are enclosed, and fields that are not enclosed
     * within the same line/record.
     *
     * @param[in] fields_enclosed_by The character used to enclose the fields.
     * @param[in] enclosure_mode How the CSV file should be parsed.
     * @return void
     */
    void set_enclosed_char(char fields_enclosed_by, enclosure_type_t enclosure_mode);

    /**
     * Defines the Field Delimiter character used in the text file
     *
     * @param[in] fields_terminated_by
     * @return void
     */
    void set_field_term_char(char fields_terminated_by);

    /**
     * Defines the Record Terminator character used in the text file
     *
     * @param[in] lines_terminated_by
     * @return void
     */
    void set_line_term_char(char lines_terminated_by);

    /**
     * Returns whether there is still more data
     *
     * This method returns a boolean value indicating whether or not there are
     * still more records to be extracted in the current file being parsed.
     *
     * Call this method to see if there are more rows to retrieve before invoking csv_parser::get_row()
     *
     * This accessor does not modify the object and may be called on a const instance.
     *
     * @see csv_parser::get_row()
     * @see csv_parser::more_rows
     *
     * @return bool Returns true if there are still more rows and false if there is not.
     */
    bool has_more_rows(void) const
    {
        return more_rows;
    }

    /**
     * Defines the number of records to discard
     *
     * The number of records specified will be discarded during the parsing process.
     *
     * @see csv_parser::_skip_lines()
     * @see csv_parser::get_row()
     * @see csv_parser::has_more_rows()
     *
     * @param[in] lines_to_skip How many records should be skipped
     * @return void
     */
    void set_skip_lines(unsigned int lines_to_skip)
    {
        ignore_num_lines = lines_to_skip;
    }

    /**
     * Return the current row from the CSV file
     *
     * The row is returned as a vector of string objects.
     *
     * This method should be called only if csv_parser::has_more_rows() is true
     *
     * @see csv_parser::has_more_rows()
     * @see csv_parser::get_record_count()
     * @see csv_parser::reset_record_count()
     * @see csv_parser::more_rows
     *
     * @return csv_row A vector type containing an array of strings
     */
    csv_row get_row(void);

    /**
     * Returns the number of times the csv_parser::get_row() method has been invoked
     *
     * This accessor does not modify the object and may be called on a const instance.
     *
     * @see csv_parser::reset_record_count()
     * @return unsigned int The number of times the csv_parser::get_row() method has been invoked.
     */
    unsigned int get_record_count(void) const
    {
        return record_count;
    }

    /**
     * Resets the record_count internal attribute to zero
     *
     * This may be used if the object is reused multiple times.
     *
     * @see csv_parser::record_count
     * @see csv_parser::get_record_count()
     * @return void
     */
    void reset_record_count(void)
    {
        record_count = 0U;
    }

private :

    /**
     * Ignores N records in the CSV file
     *
     * Where N is the value of the csv_parser::ignore_num_lines internal property.
     *
     * The number of lines skipped can be defined by csv_parser::set_skip_lines()
     *
     * @see csv_parser::set_skip_lines()
     *
     * @return void
     */
    void _skip_lines(void);

    /**
     * Reads a Single Line
     *
     * Reads a single record into the buffer passed by reference to the method
     *
     * @param[in,out] buffer A pointer to a character array for the current line.
     * @param[out] buffer_len A pointer to an integer storing the length of the buffer.
     * @return void
     */
    void _read_single_line(char ** buffer, unsigned int * buffer_len);

    /**
     * Extracts the fields without enclosures
     *
     * This is used when the enclosure character is not set
     *
     * @param[out] row The vector of strings
     * @param[in] line The character array buffer containing the current record/line
     * @param[in] line_length The length of the buffer
     */
    void _get_fields_without_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);

    /**
     * Extracts the fields with enclosures
     *
     * This is used when the enclosure character is set.
     *
     * @param[out] row The vector of strings
     * @param[in] line The character array buffer containing the current record/line
     * @param[in] line_length The length of the buffer
     */
    void _get_fields_with_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);

    /**
     * Extracts the fields when enclosure is optional
     *
     * This is used when the enclosure character is optional
     *
     * Hence, there could be fields that use it, and fields that don't.
     *
     * @param[out] row The vector of strings
     * @param[in] line The character array buffer containing the current record/line
     * @param[in] line_length The length of the buffer
     */
    void _get_fields_with_optional_enclosure(csv_row_ptr row, const char * line, const unsigned int * line_length);

protected :

    /**
     * The enclosure character
     *
     * If present or used for a field it is assumed that both ends of the fields are wrapped.
     *
     * This is that single character used in the document to wrap the fields.
     *
     * @see csv_parser::_get_fields_without_enclosure()
     * @see csv_parser::_get_fields_with_enclosure()
     * @see csv_parser::_get_fields_with_optional_enclosure()
     *
     * @var enclosed_char
     */
    char enclosed_char;

    /**
     * The escape character
     *
     * For now the only valid escape character allowed is the backslash character 0x5C
     *
     * This is only important when the enclosure character is required or optional.
     *
     * This is the backslash character used to escape enclosure characters found within the fields.
     *
     * @see csv_parser::_get_fields_with_enclosure()
     * @see csv_parser::_get_fields_with_optional_enclosure()
     * @todo Update the code to accept other escape characters besides the backslash
     *
     * @var escaped_char
     */
    char escaped_char;

    /**
     * The field terminator
     *
     * This is the single character used to mark the end of a column in the text file.
     *
     * Common characters used include the comma, tab, and semi-colons.
     *
     * This is the single character used to separate fields within a record.
     *
     * @var field_term_char
     */
    char field_term_char;

    /**
     * The record terminator
     *
     * This is the single character used to mark the end of a record in the text file.
     *
     * The most popular one is the new line character however it is possible to use others as well.
     *
     * This is the single character used to mark the end of a record
     *
     * @see csv_parser::get_row()
     *
     * @var line_term_char
     */
    char line_term_char;

    /**
     * Enclosure length
     *
     * This is the length of the enclosure character
     *
     * @see csv_parser::csv_parser()
     * @see csv_parser::set_enclosed_char()
     *
     * @var enclosed_length
     */
    unsigned int enclosed_length;

    /**
     * The length of the escape character
     *
     * Right now this is really not being used.
     *
     * It may be used in future versions of the object.
     *
     * @todo Update the code to accept other escape characters besides the backslash
     *
     * @var escaped_length
     */
    unsigned int escaped_length;

    /**
     * Length of the field terminator
     *
     * For now this is not being used. It will be used in future versions of the object.
     *
     * @var field_term_length
     */
    unsigned int field_term_length;

    /**
     * Length of the record terminator
     *
     * For now this is not being used. It will be used in future versions of the object.
     *
     * @var line_term_length
     */
    unsigned int line_term_length;

    /**
     * Number of records to discard
     *
     * This variable controls how many records in the file are skipped before parsing begins.
     *
     * @see csv_parser::_skip_lines()
     * @see csv_parser::set_skip_lines()
     *
     * @var ignore_num_lines
     */
    unsigned int ignore_num_lines;

    /**
     * Number of times the get_row() method has been called
     *
     * @see csv_parser::get_row()
     * @var record_count
     */
    unsigned int record_count;

    /**
     * The CSV File Pointer
     *
     * This is the pointer to the CSV file
     *
     * It is closed by the destructor when it is not NULL.
     *
     * @var input_fp
     */
    FILE * input_fp;

    /**
     * Buffer to input file name
     *
     * This buffer is used to store the name of the file that is being parsed
     *
     * It is released with free() by the destructor when it is not NULL, so it
     * must only ever hold memory obtained from malloc()/calloc()/realloc().
     *
     * @var input_filename
     */
    char * input_filename;

    /**
     * Mode in which the CSV file will be parsed
     *
     * The various values are explained below
     *
     * @li ENCLOSURE_NONE     (1) means the CSV file does not use any enclosure characters for the fields
     * @li ENCLOSURE_REQUIRED (2) means the CSV file requires enclosure characters for all the fields
     * @li ENCLOSURE_OPTIONAL (3) means the use of enclosure characters for the fields is optional
     *
     * @see csv_parser::get_row()
     * @see csv_parser::_read_single_line()
     * @see csv_parser::_get_fields_without_enclosure()
     * @see csv_parser::_get_fields_with_enclosure()
     * @see csv_parser::_get_fields_with_optional_enclosure()
     *
     * @var enclosure_type
     */
    enclosure_type_t enclosure_type;

    /**
     * There are still more records to parse
     *
     * This boolean property is an internal indicator of whether there are still records in the
     * file to be parsed.
     *
     * @see csv_parser::has_more_rows()
     * @var more_rows
     */
    bool more_rows;

}; /* class csv_parser */

#endif /* CSV_PARSER_HPP_INCLUDED */