Browse Source

fix favicons

main
matthew 2 years ago
parent
commit
b535b292df
  1. 38
      searching-front/services/modules/parser/index.ts

38
searching-front/services/modules/parser/index.ts

@ -23,38 +23,36 @@ const isInvalidLink = (link: string) => {
return link.match(/\./) && !(link.match(/\.html/) || link.match(/\.htm/)) return link.match(/\./) && !(link.match(/\.html/) || link.match(/\.htm/))
} }
/**
 * Resolves the favicon URL declared by a parsed page.
 *
 * Looks up the first element matching `[rel=icon][type*=image]` and reads its
 * `href`. If the `href` already parses as an absolute URL it is returned as-is
 * (normalised by `URL`); otherwise it is treated as a path relative to
 * `domain` and joined onto it.
 *
 * @param dom - Parsed document to search for the icon link.
 * @param domain - Origin (e.g. `http://site.ton`) used as the prefix for a
 *   relative `href`.
 * @returns The favicon URL as a string, or `undefined` when the page declares
 *   no matching icon, the `href` is empty, or no valid URL can be built.
 */
const getFaviconUrl = (dom: JSDOM, domain: string): string | undefined => {
  try {
    const node = dom.window.document.querySelector<HTMLLinkElement>("[rel=icon][type*=image]")
    const href = node?.href
    // Explicit "no icon" exit instead of relying on a swallowed TypeError.
    if (!href) {
      return undefined
    }

    let url: URL
    try {
      // Succeeds only for absolute hrefs; relative ones throw without a base.
      url = new URL(href)
    } catch {
      url = new URL(domain + "/" + href)
      // Joining with "/" can produce duplicated slashes when either side
      // already has one; collapse every run of slashes in the path.
      url.pathname = url.pathname.replace(/\/{2,}/g, "/")
    }
    return url.toString()
  } catch {
    // Best-effort: a malformed domain/href must not fail the whole page parse.
    return undefined
  }
}
class Parser { class Parser {
constructor() {} constructor() {}
parseUrl = async (url: string, domain:string) => { parseUrl = async (url: string, domain: string) => {
try { try {
const { data, headers } = await axios.get(url, {
const { data, headers } = await axios.get(url,{
proxy: getTonProxy(), proxy: getTonProxy(),
}) })
const contentType = headers["content-type"].toLocaleLowerCase() const contentType = headers["content-type"].toLocaleLowerCase()
if (!contentType.startsWith('text/html')) { if (!contentType.startsWith("text/html")) {
return SHOULD_NOT_PARSE return SHOULD_NOT_PARSE
} }
@ -66,7 +64,7 @@ class Parser {
dom.window.document.querySelectorAll("a").forEach(({ href }) => { dom.window.document.querySelectorAll("a").forEach(({ href }) => {
if (isInnerLink(href)) { if (isInnerLink(href)) {
const url = new URL("ton://a.ton" + href) const url = new URL("ton://a.ton" + href)
if (!isInvalidLink(url.pathname) && [...subPagesSet].length < 50 ) { if (!isInvalidLink(url.pathname) && [...subPagesSet].length < 50) {
subPagesSet.add(url.pathname) subPagesSet.add(url.pathname)
} }
} }
@ -86,12 +84,12 @@ class Parser {
.querySelector("meta[name='description']") .querySelector("meta[name='description']")
?.getAttribute("content") || "", ?.getAttribute("content") || "",
url, url,
faviconUrl: getFaviconUrl(dom, domain) faviconUrl: getFaviconUrl(dom, domain),
}, },
subPages, subPages,
} }
} catch (e) { } catch (e) {
console.log("Parse error ",e, url) console.log("Parse error ", e?.code, url)
return SHOULD_NOT_PARSE return SHOULD_NOT_PARSE
} }
} }

Loading…
Cancel
Save