txtv/src/Page.cpp
2025-01-22 13:24:08 +01:00

122 lines
3.2 KiB
C++

// _____ _ _____
// |_ _|____ _| ||_ _|_ __
// | |/ _ \ \/ / __|| | \ \ / /
// | | __/> <| |_ | | \ V /
// |_|\___/_/\_\\__||_| \_/
// Author: Love Billenius <lovebillenius@disroot.org>
// License: GPL-3
#include "Page.hpp"
#include <cstddef>
#include <format>
#include <iostream>
#include <limits>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include "stringutil.hpp"
#include "cpr/cpr.h"
#include "libxml/HTMLparser.h"
#include "libxml/xpath.h"
/**
 * Creates the page with the given number and immediately loads its subpages.
 *
 * The `subpages` initializer calls fetchSubpages(), so constructing a Page
 * performs the download and propagates whatever fetchSubpages() throws.
 *
 * NOTE(review): fetchSubpages() reads `number` through url(); this presumably
 * relies on `number` being declared before `subpages` in Page.hpp - confirm.
 * NOTE(review): uint_fast8_t is only guaranteed to hold 0-255; confirm that
 * range covers every page number the application needs.
 *
 * @param number page number to load
 */
Page::Page(const uint_fast8_t number)
    : number(number),
      subpages(fetchSubpages()) {}
/**
 * Builds a new Page for the number directly below this one.
 *
 * Unlike a conventional postfix decrement, this object is left untouched
 * (the operator is const); the neighbouring page is constructed, and
 * therefore fetched, from scratch.
 *
 * @return a freshly constructed Page for `number - 1`
 */
Page Page::operator--(int) const {
    // `number - 1` is computed as int; the constructor parameter is
    // uint_fast8_t, so spell out the conversion that happens either way.
    const auto previousNumber = static_cast<uint_fast8_t>(number - 1);
    return Page(previousNumber);
}
/**
 * Builds a new Page for the number directly above this one.
 *
 * Unlike a conventional postfix increment, this object is left untouched
 * (the operator is const); the neighbouring page is constructed, and
 * therefore fetched, from scratch.
 *
 * @return a freshly constructed Page for `number + 1`
 */
Page Page::operator++(int) const {
    // `number + 1` is computed as int; the constructor parameter is
    // uint_fast8_t, so spell out the conversion that happens either way.
    const auto nextNumber = static_cast<uint_fast8_t>(number + 1);
    return Page(nextNumber);
}
/**
 * Moves this page one number down and reloads its content.
 *
 * The int argument is ignored: the step is always exactly one page.
 * If refresh() throws, `number` has already been decremented while
 * `subpages` still holds the previous page's content.
 *
 * @return reference to this page
 */
Page &Page::operator-=(int) {
    --number;
    refresh();  // result (changed / unchanged) is intentionally not used here
    return *this;
}
std::string Page::str() const {
std::string ret;
for (const std::string &pageText: subpages) {
std::istringstream stream(pageText);
std::string line;
while (std::getline(stream, line))
ret += line + "\n";
string_utils::limitConsecutiveWhitespace(ret, MAX_WHITESPACE);
string_utils::removeTrailingWhitespace(ret);
}
return ret;
}
/**
 * Moves this page one number up and reloads its content.
 *
 * The int argument is ignored: the step is always exactly one page.
 * If refresh() throws, `number` has already been incremented while
 * `subpages` still holds the previous page's content.
 *
 * @return reference to this page
 */
Page &Page::operator+=(int) {
    ++number;
    refresh();  // result (changed / unchanged) is intentionally not used here
    return *this;
}
bool Page::refresh() {
std::vector<std::string> newSubpages = fetchSubpages();
const bool replace = !contentEquals(newSubpages);
if (replace)
subpages = newSubpages;
return replace;
}
/**
 * Builds the address of the HTML document for the current page number.
 *
 * @return the page URL with `number` substituted into the path
 */
std::string Page::url() const {
    // Compile-time format pattern; "{}" is replaced by the page number.
    static constexpr char kPageUrlPattern[] = "https://www.svt.se/svttext/web/pages/{}.html";
    return std::format(kPageUrlPattern, number);
}
std::vector<std::string> Page::fetchSubpages() const {
const cpr::Response response = cpr::Get(cpr::Url{url()});
if (response.status_code / 100 != 2)
throw std::runtime_error("Page not found");
const htmlDocPtr doc = htmlReadMemory(response.text.c_str(), response.text.size(), nullptr, nullptr,
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
if (doc == nullptr)
throw std::runtime_error("Could not parse HTML.");
const auto xpathCtx = xmlXPathNewContext(doc);
if (!xpathCtx) {
xmlFreeDoc(doc);
throw std::runtime_error("Could not create XPath context.");
}
const auto xpathObj = xmlXPathEvalExpression(
reinterpret_cast<const xmlChar *>("//div[contains(@class,'Content_screenreaderOnly')]"), xpathCtx);
if (!xpathObj) {
xmlXPathFreeContext(xpathCtx);
xmlFreeDoc(doc);
throw std::runtime_error("Could not evaluate XPath expression.");
}
std::vector<std::string> pages;
if (const xmlNodeSetPtr nodes = xpathObj->nodesetval) {
for (int i = 0; i < nodes->nodeNr; ++i) {
xmlChar *content = xmlNodeGetContent(nodes->nodeTab[i]);
if (!content)
continue;
pages.emplace_back(reinterpret_cast<const char *>(content));
xmlFree(content);
}
}
xmlXPathFreeObject(xpathObj);
xmlXPathFreeContext(xpathCtx);
xmlFreeDoc(doc);
return pages;
}
/**
 * Checks whether the given subpages match the ones currently stored.
 *
 * Two lists match when they have the same length and every string at the
 * same position is equal.
 *
 * @param subpagesOther subpage texts to compare against the stored ones
 * @return true if both lists are element-wise identical, false otherwise
 */
bool Page::contentEquals(const std::vector<std::string> &subpagesOther) const {
    // std::vector's operator== compares the sizes first and then the
    // elements pairwise, which is exactly the required check.
    return subpages == subpagesOther;
}