You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
74 lines
2.0 KiB
74 lines
2.0 KiB
2 years ago
|
import dotenv from "dotenv"
|
||
|
import path from "path"
|
||
|
import { JSDOM } from "jsdom"
|
||
|
import axios from "axios"
|
||
|
import tinyld from "tinyld"
|
||
|
import languagedetect from "languagedetect"
|
||
|
|
||
|
dotenv.config({ path: path.resolve(__dirname, "../.env.local") })
|
||
|
|
||
|
import db from "../db/index"
|
||
|
import Elastic from "./modules/elastic"
|
||
|
import Parser from "./modules/parser"
|
||
|
import { SHOULD_NOT_PARSE } from "./modules/parser/helpers"
|
||
|
|
||
|
type SubPages = Record<string, boolean>
|
||
|
|
||
|
const findFirstNotIndexed = (subpages: SubPages = {}) => {
|
||
|
return Object.entries(subpages).find(([url, isIndexed]) => !isIndexed)?.[0]
|
||
|
}
|
||
|
|
||
|
const indexWebsite = async (domain: string, url: string, subpages: SubPages = {}) => {
|
||
|
if (!subpages[url]) {
|
||
|
const urlObj = new URL(domain + url)
|
||
|
const parseInfo = await Parser.parseUrl(urlObj.toString())
|
||
|
subpages[url] = true
|
||
|
let pages = {}
|
||
|
if (parseInfo !== SHOULD_NOT_PARSE) {
|
||
|
await Elastic.index(parseInfo.elasticData)
|
||
|
pages = {
|
||
|
...parseInfo.subPages,
|
||
|
...subpages,
|
||
|
}
|
||
|
} else {
|
||
|
pages = subpages
|
||
|
}
|
||
|
console.log(pages)
|
||
|
const firstNotIndexed = findFirstNotIndexed(pages)
|
||
|
if (firstNotIndexed) {
|
||
|
return await indexWebsite(domain, firstNotIndexed, pages)
|
||
|
}
|
||
|
} else {
|
||
|
const firstNotIndexed = findFirstNotIndexed(subpages)
|
||
|
if (firstNotIndexed) {
|
||
|
return await indexWebsite(domain, firstNotIndexed, subpages)
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
const main = async () => {
|
||
|
// await Elastic.initElastic()
|
||
|
// await Elastic.createIndex()
|
||
|
const domains = await db.domain.findMany()
|
||
|
|
||
|
if (domains) {
|
||
|
console.time("index")
|
||
|
for (const domain of domains) {
|
||
|
console.time("index" + domain.address)
|
||
|
await db.domain.update({
|
||
|
where: { address: domain.address },
|
||
|
data: { lastParse: new Date() },
|
||
|
})
|
||
|
await indexWebsite(domain.address, "/")
|
||
|
console.timeEnd("index" + domain.address)
|
||
|
}
|
||
|
console.timeEnd("index")
|
||
|
}
|
||
|
}
|
||
|
|
||
|
main()
|
||
|
.then(() => console.log("finish parser"))
|
||
|
.catch((e) => console.log("error in parserr", e))
|
||
|
|
||
|
export default {}
|