18bf80f4bSopenharmony_ci/*
28bf80f4bSopenharmony_ci * Copyright (c) 2024 Huawei Device Co., Ltd.
38bf80f4bSopenharmony_ci * Licensed under the Apache License, Version 2.0 (the "License");
48bf80f4bSopenharmony_ci * you may not use this file except in compliance with the License.
58bf80f4bSopenharmony_ci * You may obtain a copy of the License at
68bf80f4bSopenharmony_ci *
78bf80f4bSopenharmony_ci *     http://www.apache.org/licenses/LICENSE-2.0
88bf80f4bSopenharmony_ci *
98bf80f4bSopenharmony_ci * Unless required by applicable law or agreed to in writing, software
108bf80f4bSopenharmony_ci * distributed under the License is distributed on an "AS IS" BASIS,
118bf80f4bSopenharmony_ci * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
128bf80f4bSopenharmony_ci * See the License for the specific language governing permissions and
138bf80f4bSopenharmony_ci * limitations under the License.
148bf80f4bSopenharmony_ci */
158bf80f4bSopenharmony_ci#include "csv_parser.h"
168bf80f4bSopenharmony_ci
178bf80f4bSopenharmony_ci#include <algorithm>
188bf80f4bSopenharmony_ci#include <cctype>
198bf80f4bSopenharmony_ci
208bf80f4bSopenharmony_ciMETA_BEGIN_NAMESPACE()
218bf80f4bSopenharmony_ci
228bf80f4bSopenharmony_ciCsvParser::CsvParser(BASE_NS::string_view csv, const char delimiter) : delimiter_(delimiter), csv_(csv) {}
238bf80f4bSopenharmony_ci
248bf80f4bSopenharmony_cibool CsvParser::GetRow(CsvRow& row)
258bf80f4bSopenharmony_ci{
268bf80f4bSopenharmony_ci    auto nextRow = ParseRow();
278bf80f4bSopenharmony_ci    row.swap(nextRow);
288bf80f4bSopenharmony_ci    return !row.empty();
298bf80f4bSopenharmony_ci}
308bf80f4bSopenharmony_ci
318bf80f4bSopenharmony_civoid CsvParser::Reset()
328bf80f4bSopenharmony_ci{
338bf80f4bSopenharmony_ci    pos_ = 0;
348bf80f4bSopenharmony_ci}
358bf80f4bSopenharmony_ci
368bf80f4bSopenharmony_ci/**
378bf80f4bSopenharmony_ci * @brief Returns a trimmed string based on state.
388bf80f4bSopenharmony_ci * @param sv The string to trim.
398bf80f4bSopenharmony_ci * @param state State of the parser.
408bf80f4bSopenharmony_ci * @return If state is QUOTED, returns the string itself. Otherwise returns the string
418bf80f4bSopenharmony_ci *         trimmed from trailing and leading whitespace.
428bf80f4bSopenharmony_ci */
438bf80f4bSopenharmony_ciBASE_NS::string_view CsvParser::Trimmed(BASE_NS::string_view sv, State state)
448bf80f4bSopenharmony_ci{
458bf80f4bSopenharmony_ci    if (state == QUOTED) {
468bf80f4bSopenharmony_ci        return sv;
478bf80f4bSopenharmony_ci    }
488bf80f4bSopenharmony_ci    constexpr auto nspace = [](unsigned char ch) { return !std::isspace(static_cast<int>(ch)); };
498bf80f4bSopenharmony_ci    sv.remove_suffix(std::distance(std::find_if(sv.rbegin(), sv.rend(), nspace).base(), sv.end()));
508bf80f4bSopenharmony_ci    sv.remove_prefix(std::find_if(sv.begin(), sv.end(), nspace) - sv.begin());
518bf80f4bSopenharmony_ci    return sv;
528bf80f4bSopenharmony_ci}
538bf80f4bSopenharmony_ci
/**
 * @brief Resolves the character following a backslash into the character it stands for.
 * @param next The character that follows the backslash.
 * @return {true, replacement} when next forms a supported escape sequence
 *         (n -> newline, t -> tab, backslash and double quote -> themselves),
 *         otherwise {false, next}.
 */
std::pair<bool, char> HandleEscaped(char next)
{
    switch (next) {
        case 'n':
            return { true, '\n' };
        case 't':
            return { true, '\t' };
        case '\\':
        case '"':
            // These escape to themselves.
            return { true, next };
        default:
            // Not a recognised escape sequence, report the character untouched.
            return { false, next };
    }
}
768bf80f4bSopenharmony_ci
778bf80f4bSopenharmony_ciCsvParser::CsvRow CsvParser::ParseRow()
788bf80f4bSopenharmony_ci{
798bf80f4bSopenharmony_ci    BASE_NS::vector<BASE_NS::string> items;
808bf80f4bSopenharmony_ci    BASE_NS::string item;
818bf80f4bSopenharmony_ci    State state { NO_QUOTE };
828bf80f4bSopenharmony_ci
838bf80f4bSopenharmony_ci    while (pos_ < csv_.size()) {
848bf80f4bSopenharmony_ci        auto c = csv_[pos_++];
858bf80f4bSopenharmony_ci        if (c == '\r') { // Ignore carriage returns
868bf80f4bSopenharmony_ci            continue;
878bf80f4bSopenharmony_ci        }
888bf80f4bSopenharmony_ci        if (c == '"') {
898bf80f4bSopenharmony_ci            if (state == IN_QUOTE && pos_ < csv_.size() - 1 && csv_[pos_] == '"') {
908bf80f4bSopenharmony_ci                // Double quotes interpreted as a single quote
918bf80f4bSopenharmony_ci                item += c;
928bf80f4bSopenharmony_ci                pos_++;
938bf80f4bSopenharmony_ci            } else { // Begin/end quote
948bf80f4bSopenharmony_ci                state = (state == NO_QUOTE) ? IN_QUOTE : QUOTED;
958bf80f4bSopenharmony_ci                if (state == IN_QUOTE) {
968bf80f4bSopenharmony_ci                    // Quoted part starts, ignore anything before it
978bf80f4bSopenharmony_ci                    item.clear();
988bf80f4bSopenharmony_ci                }
998bf80f4bSopenharmony_ci            }
1008bf80f4bSopenharmony_ci        } else if (c == delimiter_ && state != IN_QUOTE) {
1018bf80f4bSopenharmony_ci            // Delimiter found while not within quotes, move to next item
1028bf80f4bSopenharmony_ci            items.emplace_back(Trimmed(item, state));
1038bf80f4bSopenharmony_ci            item.clear();
1048bf80f4bSopenharmony_ci            state = NO_QUOTE;
1058bf80f4bSopenharmony_ci        } else if (c == '\n' && state != IN_QUOTE) {
1068bf80f4bSopenharmony_ci            // End of line while not within quotes, the row is complete
1078bf80f4bSopenharmony_ci            break;
1088bf80f4bSopenharmony_ci        } else if (state != QUOTED) {
1098bf80f4bSopenharmony_ci            // By default include character in result, unless we already had
1108bf80f4bSopenharmony_ci            // quoted content, then anything outside of quotes is ignored until
1118bf80f4bSopenharmony_ci            // next delimiter
1128bf80f4bSopenharmony_ci            if (c == '\\' && pos_ < csv_.size() - 1) {
1138bf80f4bSopenharmony_ci                if (auto esc = HandleEscaped(csv_[pos_]); esc.first) {
1148bf80f4bSopenharmony_ci                    item += esc.second;
1158bf80f4bSopenharmony_ci                    pos_++;
1168bf80f4bSopenharmony_ci                    continue;
1178bf80f4bSopenharmony_ci                }
1188bf80f4bSopenharmony_ci            }
1198bf80f4bSopenharmony_ci            item += c;
1208bf80f4bSopenharmony_ci        }
1218bf80f4bSopenharmony_ci    }
1228bf80f4bSopenharmony_ci
1238bf80f4bSopenharmony_ci    // Any leftover since the last delimiter is the last item on the row
1248bf80f4bSopenharmony_ci    if (auto trimmed = Trimmed(item, state); !trimmed.empty()) {
1258bf80f4bSopenharmony_ci        items.emplace_back(trimmed);
1268bf80f4bSopenharmony_ci    }
1278bf80f4bSopenharmony_ci    return items;
1288bf80f4bSopenharmony_ci}
1298bf80f4bSopenharmony_ci
1308bf80f4bSopenharmony_ciMETA_END_NAMESPACE()
131