You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
76 lines
2.1 KiB
76 lines
2.1 KiB
2 years ago
|
import dotenv from "dotenv"
|
||
|
import path from "path"
|
||
|
import { JSDOM } from "jsdom"
|
||
|
import axios from "axios"
|
||
|
import { convert } from "html-to-text"
|
||
|
import { load } from "cheerio"
|
||
|
import textversion from "textversionjs"
|
||
|
import { ElasticIndexParams } from "../elastic"
|
||
|
import { htmlToText, SHOULD_NOT_PARSE } from "./helpers"
|
||
|
import { URL } from "url"
|
||
|
|
||
|
/**
 * Shape of the data gathered from a single parsed page.
 */
interface ParseUrlResult {
  /** Page fields, typed as `ElasticIndexParams` (declared in `../elastic`). */
  elasticData: ElasticIndexParams

  /** Boolean flag per sub-page, keyed by a string identifying the sub-page. */
  subPages: { [subPage: string]: boolean }
}
|
||
|
|
||
|
/**
 * Tells whether a raw `href` value points to a page on the same site.
 *
 * A link counts as inner only when it is root-relative (`/docs/page`).
 * Protocol-relative links (`//other.host/page`) also begin with a slash but
 * address a different host, so they are rejected.
 *
 * @param link - the `href` value to classify
 * @returns `true` for root-relative links, `false` for everything else
 */
const isInnerLink = (link: string): boolean => {
  return link.startsWith("/") && !link.startsWith("//")
}
|
||
|
|
||
|
/**
 * Tells whether a URL path points to something other than an HTML page.
 *
 * Only the extension of the last path segment is inspected, so dots inside
 * directory names (`/v1.2/docs`) do not disqualify a page. A path with no
 * extension is treated as a page; a path with an extension is accepted only
 * when that extension is exactly `html` or `htm` (case-insensitive).
 *
 * @param link - URL path to check (no query string or fragment expected)
 * @returns `true` when the path has a non-HTML file extension, `false` otherwise
 */
const isInvalidLink = (link: string): boolean => {
  const lastSegment = link.slice(link.lastIndexOf("/") + 1)
  const dotIndex = lastSegment.lastIndexOf(".")

  // No dot in the final segment: an extension-less path, assumed to be a page.
  if (dotIndex === -1) {
    return false
  }

  const extension = lastSegment.slice(dotIndex + 1).toLowerCase()
  return extension !== "html" && extension !== "htm"
}
|
||
|
|
||
|
/**
 * Downloads a single page and extracts its text fields and inner links.
 */
class Parser {
  /**
   * Fetches `url` and, when the response is HTML, pulls out the page's
   * title, first `<h1>`, body text, meta description and inner links.
   *
   * @param url - address of the page to download
   * @returns an object with `elasticData` (title, h1, bodyText, description
   *   and the requested url) and `subPages` (every accepted inner link path
   *   found on the page, each mapped to `false`), or `SHOULD_NOT_PARSE` when
   *   the response is not HTML or any step throws.
   */
  parseUrl = async (url: string) => {
    try {
      const { data, headers } = await axios.get(url)

      // Compare only the media type: the header may be absent, and valid HTML
      // responses differ in charset and in spacing around the parameters.
      const contentType = String(headers["content-type"] ?? "").toLowerCase()
      const mediaType = contentType.split(";")[0].trim()
      if (mediaType !== "text/html") {
        return SHOULD_NOT_PARSE
      }

      const dom = new JSDOM(data)
      try {
        const { document } = dom.window

        // Collect every inner link of the page for further processing.
        // Assigning by key de-duplicates paths while keeping first-seen order.
        const subPages: Record<string, boolean> = {}
        document.querySelectorAll("a").forEach(({ href }) => {
          if (!isInnerLink(href)) {
            return
          }
          // A placeholder origin lets URL parse the root-relative href; only
          // the resulting pathname is kept, so query and fragment are dropped.
          const linkUrl = new URL("ton://a.ton" + href)
          if (!isInvalidLink(linkUrl.pathname)) {
            subPages[linkUrl.pathname] = false
          }
        })

        return {
          elasticData: {
            title: document.title,
            h1: document.querySelector("h1")?.textContent || undefined,
            bodyText: htmlToText(data),
            description:
              document
                .querySelector("meta[name='description']")
                ?.getAttribute("content") || "",
            url,
          },
          subPages,
        }
      } finally {
        // Everything needed has been copied out; release the jsdom window.
        dom.window.close()
      }
    } catch (e) {
      // Any failure (network, parsing) is logged and reported as "do not parse".
      console.log("Error", e)
      return SHOULD_NOT_PARSE
    }
  }
}
|
||
|
|
||
|
// Default export: a single Parser instance constructed when this module is evaluated.
export default new Parser()
|