diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 50aa9b6..f2c55f9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,6 +5,8 @@ add_executable(${PROJECT_NAME} main.cpp Page.hpp Page.cpp + stringutil.hpp + stringutil.cpp ) target_link_libraries(${PROJECT_NAME} PRIVATE LibXml2::LibXml2 cpr::cpr) diff --git a/src/Page.cpp b/src/Page.cpp index 1d3dfa3..a280ec8 100644 --- a/src/Page.cpp +++ b/src/Page.cpp @@ -8,6 +8,9 @@ #include "Page.hpp" +#include + +#include "stringutil.hpp" #include "cpr/cpr.h" #include "libxml/HTMLparser.h" #include "libxml/xpath.h" @@ -32,13 +35,16 @@ Page &Page::operator-=(int) { std::string Page::str() const { std::string ret; - for (const std::string &_pageText: subpages) { - std::string pageText = _pageText; - pageText.erase(std::ranges::remove(pageText, '\t').begin(), pageText.end()); + + for (const std::string &pageText: subpages) { std::istringstream stream(pageText); std::string line; while (std::getline(stream, line)) ret += line + "\n"; + + string_utils::removeTabs(ret); + string_utils::limitConsecutiveWhitespace(ret, MAX_WHITESPACE); + string_utils::removeTrailingWhitespace(ret); } return ret; } @@ -63,7 +69,7 @@ std::string Page::url() const { } std::vector Page::fetchSubpages() const { - const cpr::Response response = cpr::Get(url()); + const cpr::Response response = cpr::Get(cpr::Url{url()}); if (response.status_code / 100 != 2) throw std::runtime_error("Page not found"); const htmlDocPtr doc = htmlReadMemory(response.text.c_str(), response.text.size(), nullptr, nullptr, diff --git a/src/Page.hpp b/src/Page.hpp index 0a37f99..f36ccb0 100644 --- a/src/Page.hpp +++ b/src/Page.hpp @@ -11,6 +11,7 @@ #include static constexpr uint_fast8_t DEFAULT_NUMBER = 100; +static constexpr uint_fast8_t MAX_WHITESPACE = 2; class Page { private: diff --git a/src/main.cpp b/src/main.cpp index 3b04661..451498b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,5 +1,8 @@ #include +#include "Page.hpp" + int main(int argc, char 
namespace string_utils {
    /**
     * @brief Tells whether a string contains nothing but whitespace.
     *
     * @param str Text to inspect.
     * @return true when every character satisfies std::isspace; an empty string
     *         therefore also yields true. false otherwise.
     */
    bool isAllWhitespace(const std::string &str) {
        // std::isspace needs a value representable as unsigned char, hence the
        // parameter type of the lambda.
        return std::all_of(str.begin(), str.end(), [](const unsigned char c) -> bool {
            return std::isspace(c) != 0;
        });
    }

    /**
     * @brief Strips trailing blank (empty or whitespace-only) lines.
     *
     * Everything from the first line break that follows the last non-whitespace
     * character is removed, so the result never ends in a newline. Whitespace
     * that trails the last non-blank line on that same line is kept. A string
     * made up entirely of whitespace becomes empty.
     *
     * @param str Text to trim in place.
     */
    void removeTrailingWhitespace(std::string &str) {
        const auto lastContent = std::find_if(str.rbegin(), str.rend(), [](const unsigned char c) -> bool {
            return std::isspace(c) == 0;
        });

        if (lastContent == str.rend()) {
            str.clear();
            return;
        }

        // base() of the reverse iterator points one past the last non-whitespace
        // character; the first '\n' from there on starts the trailing blank lines.
        const std::size_t searchFrom = static_cast<std::size_t>(lastContent.base() - str.begin());
        const std::size_t cut = str.find('\n', searchFrom);
        if (cut != std::string::npos)
            str.erase(cut);
    }

    /**
     * @brief Deletes every tab character from the string.
     *
     * @param str Text to modify in place.
     */
    void removeTabs(std::string &str) {
        str.erase(std::remove(str.begin(), str.end(), '\t'), str.end());
    }

    /**
     * @brief Caps runs of blank (empty or whitespace-only) lines.
     *
     * Blank lines before the first non-blank line are dropped entirely. After
     * that, at most @p maxWhitespace consecutive blank lines are kept and the
     * rest of each run is discarded. Every line written back is terminated by
     * a newline, including the last one.
     *
     * @param str           Text to process in place.
     * @param maxWhitespace Maximum number of consecutive blank lines to keep.
     */
    void limitConsecutiveWhitespace(std::string &str, const uint_fast8_t maxWhitespace) {
        std::istringstream stream(str);
        std::string line;
        std::string processed;
        processed.reserve(str.size());

        // Counted in size_t: an 8-bit counter wraps after 255 blank lines and
        // would start letting blank lines through again.
        const std::size_t limit = maxWhitespace;
        std::size_t blankRun = 0;
        bool seenContent = false;

        while (std::getline(stream, line)) {
            if (isAllWhitespace(line)) {
                if (!seenContent)
                    continue; // Skip leading blank lines
                if (++blankRun > limit)
                    continue; // Skip blank lines beyond the allowed run length
            } else {
                seenContent = true;
                blankRun = 0;
            }

            processed += line;
            processed += '\n';
        }

        str.swap(processed);
    }
}
/dev/null +++ b/src/stringutil.hpp @@ -0,0 +1,46 @@ +#pragma once +#include + +/** + * @brief String utility functions. + */ +namespace string_utils { + + /** + * @brief Checks if a given string consists solely of whitespace characters. + * + * @param str The string to check. + * @return true If all characters in the string are whitespace (an empty string also qualifies). + * @return false Otherwise. + */ + bool isAllWhitespace(const std::string& str); + + /** + * @brief Removes trailing whitespace lines from the given string. + * + * This function removes all consecutive empty or whitespace-only lines at the end + * of the input string, including the line break in front of each one, so the result never ends in a newline. + * + * @param str The string from which to remove trailing whitespace lines. + */ + void removeTrailingWhitespace(std::string& str); + + /** + * @brief Removes all tab characters from the given string. + * + * @param str The string from which to remove tab characters. + */ + void removeTabs(std::string& str); + + /** + * @brief Limits the number of consecutive whitespace lines in the given string. + * + * This function ensures that no more than a specified number of consecutive empty + * or whitespace-only lines exist in the string. Blank lines before the first non-blank line are dropped, and every remaining line is written back terminated by a newline. + * + * @param str The string to process. + * @param maxWhitespace The maximum allowed consecutive whitespace lines. + */ + void limitConsecutiveWhitespace(std::string& str, uint_fast8_t maxWhitespace); + +}