import dotenv from "dotenv"
import path from "path"
import { JSDOM } from "jsdom"
import axios from "axios"
import { convert } from "html-to-text"
import { load } from "cheerio"
import textversion from "textversionjs"
import { ElasticIndexParams } from "../elastic"
import { htmlToText, SHOULD_NOT_PARSE } from "./helpers"
import { URL } from "url"
import { getTonProxy } from "../../helpers"

/** Result of successfully parsing one page. */
interface ParseUrlResult {
  /** Fields extracted from the page, in the shape expected by the indexer. */
  elasticData: ElasticIndexParams
  /** Same-site paths discovered on the page, each mapped to `false`. */
  subPages: Record<string, boolean>
}

/** Upper bound on the number of distinct sub-page paths collected per page. */
const MAX_SUB_PAGES = 50

/**
 * Tells whether a raw `href` is a site-relative link (starts with a single "/").
 *
 * Protocol-relative links ("//other.host/path") are rejected: they point to a
 * different host and must not be recorded as sub-pages of the current site.
 */
const isInnerLink = (link: string): boolean => {
  return link.startsWith("/") && !link.startsWith("//")
}

/**
 * Tells whether a path looks like a non-HTML resource: it contains a dot but
 * does not contain ".htm" (which also covers ".html").
 */
const isInvalidLink = (link: string): boolean => {
  return link.includes(".") && !/\.htm/.test(link)
}

/**
 * Looks up the page favicon declared via `<link rel="icon" type="image/...">`.
 *
 * @param dom parsed document
 * @param domain prefix used to build a URL when the icon href is not absolute
 * @returns the favicon URL as a string, or `undefined` when the page declares
 *          no such icon or no valid URL can be built from it
 */
const getFaviconUrl = (dom: JSDOM, domain: string): string | undefined => {
  try {
    const node = dom.window.document.querySelector<HTMLLinkElement>(
      "[rel=icon][type*=image]"
    )
    const href = node?.href
    if (!href) {
      return undefined
    }

    let url: URL
    try {
      // Absolute href: use it as is.
      url = new URL(href)
    } catch (e) {
      // Not absolute: prefix it with the domain and collapse the doubled
      // slash that appears when both sides contribute one.
      url = new URL(domain + href)
      url.pathname = url.pathname.replace('//', '/')
    }
    return url.toString()
  } catch (e) {
    // Best effort: a favicon that cannot be resolved is simply omitted.
    return undefined
  }
}

class Parser {
  /**
   * Downloads `url` through the TON proxy and extracts the indexable data and
   * the same-site links found on the page.
   *
   * @param url address of the page to fetch
   * @param domain prefix passed to the favicon lookup for relative icon hrefs
   * @returns the extracted data and sub-pages, or `SHOULD_NOT_PARSE` when the
   *          response is not `text/html` or when fetching/parsing fails
   */
  parseUrl = async (url: string, domain: string) => {
    try {
      const { data, headers } = await axios.get(url, {
        proxy: getTonProxy(),
      })

      // A response without a content-type header is treated as non-HTML
      // instead of raising a TypeError on `undefined`.
      const contentType = String(headers["content-type"] ?? "").toLowerCase()
      if (!contentType.startsWith('text/html')) {
        return SHOULD_NOT_PARSE
      }

      const dom = new JSDOM(data)
      const subPagesSet = new Set<string>()

      // Collect all links and put them into a set for further processing.
      dom.window.document.querySelectorAll("a").forEach(({ href }) => {
        if (!isInnerLink(href)) {
          return
        }
        // The host is a placeholder: only the normalised pathname is kept.
        const { pathname } = new URL("ton://a.ton" + href)
        if (!isInvalidLink(pathname) && subPagesSet.size < MAX_SUB_PAGES) {
          subPagesSet.add(pathname)
        }
      })

      // Every discovered path starts out mapped to `false`.
      const subPages: Record<string, boolean> = Object.fromEntries(
        [...subPagesSet].map((pagePath): [string, boolean] => [pagePath, false])
      )

      const { document } = dom.window
      return {
        elasticData: {
          title: document.title,
          h1: document.querySelector("h1")?.textContent || undefined,
          bodyText: htmlToText(data),
          description:
            document
              .querySelector("meta[name='description']")
              ?.getAttribute("content") || "",
          url,
          faviconUrl: getFaviconUrl(dom, domain),
        },
        subPages,
      }
    } catch (e) {
      console.log("Parse error ", e, url)
      return SHOULD_NOT_PARSE
    }
  }
}

export default new Parser()