base.js 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. const PCR = require('puppeteer-chromium-resolver')
  2. const showdown = require('showdown')
  3. const models = require('../../models')
  4. const BaseSpider = require('../base')
  5. const globalConfig = require('../../config')
  6. const config = require('../config')
  7. const logger = require('../../logger')
  8. showdown.setOption('tables', true)
  9. showdown.setOption('tasklists', true)
  10. showdown.setFlavor('github')
  11. class BaseImportSpider extends BaseSpider {
  12. constructor(platformName) {
  13. super(BaseSpider)
  14. if (!platformName) {
  15. throw new Error('platformId must not be empty')
  16. }
  17. this.platformName = platformName
  18. }
  19. async init() {
  20. // 平台
  21. this.platform = await models.Platform.findOne({ name: this.platformName })
  22. // PCR
  23. this.pcr = await PCR({
  24. revision: '',
  25. detectionPath: '',
  26. folderName: '.chromium-browser-snapshots',
  27. hosts: ['https://storage.googleapis.com', 'https://npm.taobao.org/mirrors'],
  28. retry: 3,
  29. silent: false
  30. })
  31. // 是否开启chrome浏览器调试
  32. const enableChromeDebug = await models.Environment.findOne({ _id: constants.environment.ENABLE_CHROME_DEBUG }).value
  33. // 浏览器
  34. this.browser = await this.pcr.puppeteer.launch({
  35. executablePath: this.pcr.executablePath,
  36. timeout: 60000,
  37. //如果是访问https页面 此属性会忽略https错误
  38. ignoreHTTPSErrors: true,
  39. devtools: false,
  40. headless: enableChromeDebug !== 'Y',
  41. })
  42. // 页面
  43. this.page = await this.browser.newPage()
  44. // 设置 浏览器视窗
  45. await this.page.setViewport({
  46. width: 1300,
  47. height: 938
  48. })
  49. // 配置
  50. this.config = config[this.platform.name]
  51. if (!config) {
  52. throw new Error(`config (platform: ${this.platform.name}) cannot be found`)
  53. }
  54. // 编辑器选择器
  55. this.editorSel = this.config.editorSel
  56. // showdown配置
  57. showdown.setOption('tables', true)
  58. showdown.setOption('tasklists', true)
  59. showdown.setFlavor('github')
  60. // markdown to html转换器
  61. this.converter = new showdown.Converter()
  62. }
  63. async fetchArticles() {
  64. // to be overridden
  65. }
  66. async fetch() {
  67. logger.info('fetching articles')
  68. await this.init()
  69. await this.setCookies()
  70. try {
  71. await this.page.goto(this.platform.url, { timeout: 60000 })
  72. } catch (e) {
  73. console.error(e)
  74. await this.browser.close()
  75. return []
  76. }
  77. await this.page.waitFor(5000)
  78. const articles = await this.fetchArticles()
  79. await this.browser.close()
  80. return articles
  81. }
  82. async importArticle(siteArticle) {
  83. // to be overridden
  84. }
  85. async import(siteArticles) {
  86. logger.info('importing articles')
  87. await this.init()
  88. await this.setCookies()
  89. for (let i = 0; i < siteArticles.length; i++) {
  90. const siteArticle = siteArticles[i]
  91. if (siteArticle.exists && siteArticle.associated) continue
  92. await this.importArticle(siteArticle)
  93. }
  94. await this.browser.close()
  95. logger.info('imported articles')
  96. }
  97. }
  98. module.exports = BaseImportSpider