Searching.ton
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

74 lines
2.0 KiB

2 years ago
import dotenv from "dotenv"
import path from "path"
import { JSDOM } from "jsdom"
import axios from "axios"
import tinyld from "tinyld"
import languagedetect from "languagedetect"
dotenv.config({ path: path.resolve(__dirname, "../.env.local") })
import db from "../db/index"
import Elastic from "./modules/elastic"
import Parser from "./modules/parser"
import { SHOULD_NOT_PARSE } from "./modules/parser/helpers"
/** Maps a page URL to a flag telling whether that page has already been indexed. */
type SubPages = Record<string, boolean>

/**
 * Picks the next page that still has to be indexed.
 *
 * @param subpages - Page map to scan; defaults to an empty map.
 * @returns The first key (in the object's own key order) whose flag is falsy,
 *          or `undefined` when every page is already indexed.
 */
const findFirstNotIndexed = (subpages: SubPages = {}): string | undefined => {
  for (const pageUrl of Object.keys(subpages)) {
    if (!subpages[pageUrl]) {
      return pageUrl
    }
  }
  return undefined
}
/**
 * Walks a website page by page and sends every parseable page to Elastic.
 *
 * Starting at `url`, each page that is not yet flagged as indexed is handed to
 * `Parser.parseUrl` and then flagged. Unless the parser answers
 * `SHOULD_NOT_PARSE`, the page's `elasticData` goes to `Elastic.index` and its
 * `subPages` are merged into the work map. The walk then continues with the
 * first entry that is still unindexed and stops when there is none.
 *
 * Implemented as a loop rather than self-recursion so the number of pending
 * async calls does not grow with the number of pages, and so the return type
 * is explicit instead of being inferred through a self-reference.
 *
 * @param domain - Prefix concatenated with each page path to build the full URL.
 * @param url - Page to start from; appended to `domain`.
 * @param subpages - Known pages and their indexed state. The entry for `url`
 *                   is flagged on this object in place.
 * @returns A promise that resolves once no unindexed page is left.
 */
const indexWebsite = async (
  domain: string,
  url: string,
  subpages: SubPages = {}
): Promise<void> => {
  let pages: SubPages = subpages
  let currentUrl = url

  for (;;) {
    if (!pages[currentUrl]) {
      const target = new URL(domain + currentUrl)
      const parseInfo = await Parser.parseUrl(target.toString())
      // Flag the page right after parsing so it is never picked again,
      // whether or not it ends up in the index.
      pages[currentUrl] = true

      if (parseInfo !== SHOULD_NOT_PARSE) {
        await Elastic.index(parseInfo.elasticData)
        // Known pages are spread last so their flags win over freshly
        // discovered entries for the same URL.
        pages = {
          ...parseInfo.subPages,
          ...pages,
        }
      }
      console.log(pages)
    }

    const nextUrl = findFirstNotIndexed(pages)
    if (!nextUrl) {
      return
    }
    currentUrl = nextUrl
  }
}
/**
 * Runs one indexing pass over every domain returned by `db.domain.findMany()`.
 *
 * For each domain, `lastParse` is set to the current time first, then the
 * site is indexed starting from "/". Overall and per-domain durations are
 * reported through `console.time` / `console.timeEnd`.
 */
const main = async () => {
  // Elastic bootstrap calls are currently disabled:
  // await Elastic.initElastic()
  // await Elastic.createIndex()
  const domains = await db.domain.findMany()
  if (!domains) {
    return
  }

  const totalLabel = "index"
  console.time(totalLabel)
  for (const { address } of domains) {
    // One timer per domain, keyed by its address.
    const domainLabel = "index" + address
    console.time(domainLabel)
    await db.domain.update({
      where: { address },
      data: { lastParse: new Date() },
    })
    await indexWebsite(address, "/")
    console.timeEnd(domainLabel)
  }
  console.timeEnd(totalLabel)
}
// Script entry point: run the indexing pass and report how it ended.
main()
  .then(() => console.log("finish parser"))
  .catch((e: unknown) => {
    // Report failures on stderr and through a non-zero exit code so that a
    // failed run can be told apart from a successful one by whatever
    // launched the script.
    console.error("error in parser", e)
    process.exitCode = 1
  })
export default {}