// Searching.ton — source listing (99 lines, 2.7 KiB)
import dotenv from "dotenv"
import path from "path"
import { JSDOM } from "jsdom"
import axios from "axios"
import { convert } from "html-to-text"
import { load } from "cheerio"
import textversion from "textversionjs"
import { ElasticIndexParams } from "../elastic"
import { htmlToText, SHOULD_NOT_PARSE } from "./helpers"
import { URL } from "url"
import { getTonProxy } from "../../helpers"
/**
 * Shape of a parsed page: the data to index plus the sub-page links found on it.
 *
 * NOTE(review): this matches the object `Parser.parseUrl` returns on success,
 * but nothing in the visible code references the interface explicitly — confirm
 * whether it is meant to be that method's return type.
 */
interface ParseUrlResult {
/** Page fields, typed as `ElasticIndexParams` (imported from `../elastic`). */
elasticData: ElasticIndexParams
/**
 * String keys mapped to boolean flags.
 * NOTE(review): presumably page path -> "already processed" flag — confirm with the caller.
 */
subPages: Record<string, boolean>
}
/**
 * Tells whether a link is a path on the current site.
 *
 * A link counts as inner when it starts with a single "/". Links starting with
 * "//" are protocol-relative references to another host and are rejected, so
 * they are never queued as sub-pages of the current site.
 *
 * @param link Raw `href` value taken from an anchor.
 * @returns `true` for root-relative paths such as "/about", `false` otherwise.
 */
const isInnerLink = (link: string): boolean => {
  return link.startsWith("/") && !link.startsWith("//")
}
/**
 * Tells whether a path points at something that should not be crawled as a page.
 *
 * A path without any dot is always accepted. A path containing a dot is
 * accepted only when it ends with ".htm" or ".html" (case-insensitive); the
 * extension is anchored to the end so that "/a.html.zip" or "/a.htmx" are not
 * mistaken for HTML pages.
 *
 * @param link Path to check (the caller passes a URL pathname, without query or hash).
 * @returns `true` when the path must be skipped, `false` when it may be crawled.
 */
const isInvalidLink = (link: string): boolean => {
  if (!link.includes(".")) {
    return false
  }
  return !/\.html?$/i.test(link)
}
/**
 * Resolves the absolute URL of the page's favicon.
 *
 * Looks up the first element matching `[rel=icon][type*=image]` and reads its
 * `href`. If the `href` is already an absolute URL it is returned as is;
 * otherwise it is appended to `domain` and any run of slashes in the resulting
 * path is collapsed to a single "/".
 *
 * @param dom Parsed page.
 * @param domain Site origin used as the prefix for relative icon paths
 *   (must itself be parseable as a URL once the path is appended).
 * @returns The favicon URL, or `undefined` when the page declares no icon or
 *   the URL cannot be built.
 */
const getFaviconUrl = (dom: JSDOM, domain: string): string | undefined => {
  try {
    const node = dom.window.document.querySelector(
      "[rel=icon][type*=image]"
    ) as HTMLLinkElement | null
    const href = node?.href
    if (!href) {
      return undefined
    }

    let url: URL
    try {
      url = new URL(href)
    } catch {
      // Not an absolute URL: treat it as a path under the site's domain.
      url = new URL(domain + "/" + href)
      // Joining with "/" can leave "//" (or longer runs) in the path; collapse them all.
      url.pathname = url.pathname.replace(/\/{2,}/g, "/")
    }
    return url.toString()
  } catch {
    // Best effort: a missing or malformed icon must not fail the page parse.
    return undefined
  }
}
/**
 * Fetches a page through the TON proxy and extracts the data to be indexed.
 */
class Parser {
  /**
   * Downloads `url` and, if it is an HTML document, extracts its indexable
   * fields and the inner links found on it.
   *
   * @param url Address of the page to download.
   * @param domain Site origin, used to resolve a relative favicon path.
   * @returns An object with `elasticData` (title, h1, body text, description,
   *   url, favicon URL) and `subPages`, or `SHOULD_NOT_PARSE` when the response
   *   is not `text/html` or any step fails.
   */
  parseUrl = async (url: string, domain: string) => {
    // Upper bound on the number of distinct sub-page paths collected per page.
    const maxSubPages = 50

    try {
      const { data, headers } = await axios.get(url, {
        proxy: getTonProxy(),
      })

      // The header may be absent; treat that as "not HTML" rather than throwing.
      const contentType = String(headers["content-type"] ?? "").toLowerCase()
      if (!contentType.startsWith("text/html")) {
        return SHOULD_NOT_PARSE
      }

      const dom = new JSDOM(data)
      const { document } = dom.window

      // Collect all inner links into a set for further processing.
      const subPagesSet = new Set<string>()
      for (const { href } of Array.from(document.querySelectorAll("a"))) {
        if (subPagesSet.size >= maxSubPages) {
          break
        }
        if (!isInnerLink(href)) {
          continue
        }
        // The host is a placeholder: only the pathname is kept, so the URL
        // parser is used here just to drop the query string and the hash.
        const { pathname } = new URL("ton://a.ton" + href)
        if (!isInvalidLink(pathname)) {
          subPagesSet.add(pathname)
        }
      }

      // Every collected path starts with the flag `false`.
      // NOTE(review): presumably "not processed yet" — confirm with the caller.
      const subPages: Record<string, boolean> = Object.fromEntries(
        [...subPagesSet].map((pathname) => [pathname, false])
      )

      return {
        elasticData: {
          title: document.title,
          h1: document.querySelector("h1")?.textContent || undefined,
          bodyText: htmlToText(data),
          description:
            document
              .querySelector("meta[name='description']")
              ?.getAttribute("content") || "",
          url,
          faviconUrl: getFaviconUrl(dom, domain),
        },
        subPages,
      }
    } catch (e) {
      // Thrown values are not guaranteed to be objects with a `code` field.
      const code = (e as { code?: unknown } | null | undefined)?.code
      console.log("Parse error ", code, url)
      return SHOULD_NOT_PARSE
    }
  }
}
// The module's default export is a single Parser instance created at import time.
export default new Parser()