# -*- coding: utf-8 -*-
#
# Copyright: © 2014 "nyov"
#
# This script crawls a Redmine wiki website and writes the history
# of all pages found to a single branch inside a Git repository.
#
# The script will create a git repository in your working directory.
# It requires the scrapy (0.24) and pygit2 python packages.
# Aside from that, it needs enough memory to hold all the records in
# memory until the crawl ends, because it can sort them by date and
# version, and flush the git tree history to disk in the correct
# order, only at the very end.
#
# Created for importing from static html pages of a redmine wiki
# (so some workarounds exist in how the crawl runs, to catch missing
# pages), but it should work on, or be easily adaptable to, a live
# Redmine instance.
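#
# Typical use (the script filename here is illustrative, use whatever
# you saved this file as):
#
#   $ python redmine2git.py
#
# Afterwards the 'wiki.git' repository in the working directory holds
# one <pagename>.textile file per wiki page, with its full history.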

from scrapy import log
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.http import Request, HtmlResponse
from scrapy.selector import Selector
import scrapy
import scrapy.utils.misc

import calendar
import datetime
import heapq
import re
import urllib
import urlparse

import pygit2
#from dateutil.parser import parse

BOT_NAME = 'RedmineExporter'
BOT_VERSION = '1.0'  # any version string, used only in the User-Agent below

# how to identify to the target website
USER_AGENT = '%s/%s (+http://www.yourdomain.com)' % (BOT_NAME, BOT_VERSION)
# how many parallel connections to keep open to the target website
CONCURRENT_REQUESTS = 16
# show duplicate (dropped) requests
DUPEFILTER_DEBUG = False
# for the debugging log level, see the end of this file


def read_git_authors(file):
    """Read a git (git-svn) authors.txt file,

    which has the line format:
    handle = Full Name <and@some.email>
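
    For example (a hypothetical mapping):
    jdoe = John Doe <jdoe@example.org>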
    """
    authors = {}
    with open(file) as f:
        data = f.read().splitlines()
    data = (l for l in data if not l.startswith('#'))  # skip comments
    for line in data:
        name, handle = line.strip().split(' = ')
        author, email = handle.rstrip('>').split(' <')
        authors[name] = (author, email)
        #print('\t%s => "%s" [%s]' % (name, author, email))
    return authors


class RedmineUser(scrapy.Item):
    author = scrapy.Field()
    email = scrapy.Field()


class RedminePage(scrapy.Item):
    pagename = scrapy.Field()
    version = scrapy.Field()
    lastversion = scrapy.Field()
    updated = scrapy.Field()
    user = scrapy.Field()
    comment = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()


class RedmineExportSpider(scrapy.Spider):
    """Xonotic Redmine exporter"""
    name = 'redmineexport'
    allowed_domains = ['dev.xonotic.org']
    start_urls = [
        # wiki's 'Index by title' page
        'http://dev.xonotic.org/projects/xonotic/wiki/index.html',
        # this page does not appear in the index (it has been renamed),
        # so request it explicitly
        'http://dev.xonotic.org/projects/xonotic/wiki/IRC.html',
    ]

    def start_requests(self):
        for link in self.start_urls[:1]:  # the index page
            yield Request(url=link, callback=self.parse_index)
        for link in self.start_urls[1:]:  # any other links
            yield Request(url=link, callback=self.parse_pages)

    def parse_index(self, response):
        le = LinkExtractor(allow=(r'/wiki/.*\.html',), restrict_xpaths=('//div[@id="wrapper"]//div[@id="content"]',))
        for link in le.extract_links(response):
            yield Request(link.url, callback=self.parse_pages)
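
    # The `x, = seq[:1] or [None]` pattern in the callbacks below unpacks
    # the first xpath match if there is one, and falls back to None
    # instead of raising on an empty result.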
    def parse_pages(self, response):
        url, = response.xpath('//div[@id="wrapper"]//div[@id="content"]//a[contains(@class, "icon-history")]/@href').extract()[:1] or [None]
        return Request(urlparse.urljoin(response.url, url), callback=self.parse_history_entry)

    def parse_history_entry(self, response):
        page = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
        paginated, = page.xpath('.//span[@class="pagination"]/a[contains(text(), "Next")]/@href').extract()[:1] or [None]
        if paginated is not None:
            # re-entry, missing pages workaround
            full, = page.xpath('.//span[@class="pagination"]/a[last()]/@href').extract()
            return Request(urlparse.urljoin(response.url, full), callback=self.parse_history)
            # FIXME: missing recursion for more pages (200+ revisions)
        return self.parse_history(response)

    def parse_history(self, response):
        page = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
        history = page.xpath('.//form//table/tbody/tr')
        pagename = re.match(r'.*/wiki/(.*)/history', response.url).group(1)
        lastversion = page.xpath('.//form//table/tbody/tr[1]/td[1]/a/text()').extract()[0]
        for row in history:
            i = RedminePage()
            i['pagename'] = pagename
            i['version'], = row.xpath('td[@class="id"]/a/text()').extract()[:1] or [None]
            i['version'] = int(i['version'])
            i['lastversion'] = int(lastversion)
            date, = row.xpath('td[@class="updated_on"]/text()').extract()
            # date parse, assume UTC
            #i['updated'] = parse(date)
            i['updated'] = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M %p")
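            # (the strptime format above matches dates like "07/24/2014 10:31 PM")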
            i['user'], = row.xpath('td[@class="author"]/a[contains(@class, "user")]/text()').extract()[:1] or [None]
            userpage, = row.xpath('td[@class="author"]/a[contains(@class, "user")]/@href').extract()[:1] or [None]
            if userpage is not None:
                yield Request(urlparse.urljoin(response.url, userpage), callback=self.parse_user)
            i['comment'], = row.xpath('td[@class="comments"]/text()').extract()[:1] or [None]
            content, = row.xpath('td[@class="buttons"]//a[contains(@href, "annotate.html")]/@href').extract()[:1] or [None]
            request = Request(urlparse.urljoin(response.url, content), callback=self.parse_page)
            request.meta['item'] = i
            yield request

    def parse_user(self, response):
        i = RedmineUser()
        user = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
        i['author'], = user.xpath('h2/text()').extract()[:1] or [None]
        i['author'] = i['author'].strip()
        #i['email'], = user.xpath('div[@class="splitcontentleft"]/ul[1]/li/a[contains(@href, "mailto")]/text()').extract()[:1] or [None]
        i['email'], = user.xpath('div[@class="splitcontentleft"]/ul[1]/li/script/text()').re(r'.*\'(.*)\'')[:1] or [None]
        if i['email'] is None:
            # no email found; fall back to a dummy address
            i['email'] = '%s@' % i['author']
        else:
            # the address is hidden in a url-encoded document.write() call;
            # unquote it and pull the text out of the resulting anchor tag
            email = urllib.unquote(i['email']).lstrip('document.write(\'').rstrip('\');').decode('string_escape').replace('\\/', '/')
            fake = Selector(HtmlResponse(response.url, encoding='utf-8', body=email))
            i['email'], = fake.xpath('//a/text()').extract()[:1] or [None]
        return i

    def parse_page(self, response):
        i = response.meta['item']
        page = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
        lines = page.xpath('table[contains(@class, "filecontent")]//tr/td[@class="line-code"]')  # keep empty lines!
        i['url'] = response.url
        i['content'] = u''
        for line in lines:
            line = (line.xpath('pre/text()').extract() or [u''])[0]
            i['content'] += line + '\n'
        return i


class GitImportPipeline(object):

    def __init__(self, *a, **kw):
        self.repo = pygit2.init_repository('wiki.git', False)  # non-bare repo
        self.heap = []  # heap for sorting commits
        self.committer = pygit2.Signature('RedmineExport', 'redmineexport@dev.xonotic.org', encoding='utf-8')

    def open_spider(self, spider):
        self.users = read_git_authors("redmine-authors.txt")

    def close_spider(self, spider):
        self.write_git(spider)

    def process_item(self, i, spider):
        if isinstance(i, RedmineUser):
            # prefer pre-loaded identities from the local file
            if i['author'] not in self.users:
                self.users[i['author']] = (i['author'], i['email'])
                log.msg("Scraped user %s" % (i['author'],), spider=spider, level=log.INFO)
        if isinstance(i, RedminePage):
            oid = self.repo.create_blob(i['content'].encode("utf8"))
            ts = calendar.timegm(i['updated'].utctimetuple())  # datetime to unix timestamp, for sorting
            heapq.heappush(self.heap, (ts, i['version'], oid, i))
            log.msg('Scraped page "%s" @ %s' % (i['pagename'], i['version']), spider=spider, level=log.INFO)
        return i
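
    # Tuples compare element-wise, so the heap yields revisions ordered by
    # timestamp first and page version second: (1406229060, 1, ...) pops
    # before (1406229060, 2, ...).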
    def write_git(self, spider):
        parent = parent_id = None
        for _ in range(len(self.heap)):
            (ts, vsn, oid, i) = heapq.heappop(self.heap)

            commit_comment = i['comment'] or u''
            add_comment = u'\n\n(Commit created by redmine exporter script from page "%s" version %s)' % (i['pagename'], i['version'])

            if parent is not None:
                tb = self.repo.TreeBuilder(parent.tree)  # treeish ~= filesystem folder
            else:
                tb = self.repo.TreeBuilder()

            filename = '%s%s' % (i['pagename'], '.textile')
            tb.insert(filename, oid, pygit2.GIT_FILEMODE_BLOB)
            tree = tb.write()  # create updated treeish with current page blob added

            parents = []
            if parent is not None:
                parents = [parent_id]

            (user, email) = self.users[i['user']]
            author = pygit2.Signature(user, email, time=ts, offset=0, encoding='utf-8')

            log.msg("Committing %s @ %s (%s)" % (i['pagename'], i['version'], oid), spider=spider, level=log.INFO)
            cid = self.repo.create_commit(
                'refs/heads/master',  # everything goes to a single branch
                author, self.committer, commit_comment + add_comment, tree, parents, 'utf-8'
            )
            # this commit is the new parent for the next commit
            parent = self.repo.get(cid)
            parent_id = cid


ITEM_PIPELINES = {  # HAXX: class object instead of a dotted path, see the load_object monkeypatch below
    GitImportPipeline: 800,
}
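
# (800 is a slot in scrapy's usual 0-1000 pipeline ordering range;
#  with a single pipeline any value works)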

# haxx: sad monkeypatch, might break
from importlib import import_module

def load_object(path):
    try:
        dot = path.rindex('.')
    except ValueError:
        raise ValueError("Error loading object '%s': not a full path" % path)
    except AttributeError:
        # no .rindex() method, so not a string: assume `path` already is
        # the object itself (as in ITEM_PIPELINES above) and pass it through
        return path
    module, name = path[:dot], path[dot+1:]
    mod = import_module(module)
    try:
        obj = getattr(mod, name)
    except AttributeError:
        raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
    return obj

scrapy.utils.misc.load_object = load_object

from scrapy.exceptions import DontCloseSpider

def finished_run(spider, reason):
    log.msg(u"""
┌───────────────────────────────────────┐
│ VERIFY IT REALLY FOUND ALL YOUR PAGES │
│ OR YOU WILL BE SORRY LATER            │
├───────────────────────────────────────┤
│ if it was successful, you now want to │
│ repack the dumped git object database:│
│                                       │
│ $ git reflog expire --expire=now --all│
│ $ git gc --prune=now                  │
│ $ git repack -A -d                    │
│ $ git gc --aggressive --prune=now     │
└───────────────────────────────────────┘
""", spider=spider, level=log.INFO)


if __name__ == "__main__":
    from twisted.internet import reactor
    from scrapy.utils.project import get_project_settings
    from scrapy.crawler import Crawler
    from scrapy import log, signals

    print(u"""
┌───────────────────────────────────────┐
│ Redmine Exporter script               │
├───────────────────────────────────────┤
│ handle with care,                     │
│ don't kill your webserver,            │
└───────────────────────────────────────┘
""")
    raw_input("Hit Enter to continue...")

    spider = RedmineExportSpider()
    settings = get_project_settings()
    settings.set('BOT_NAME', BOT_NAME, priority='cmdline')
    settings.set('USER_AGENT', USER_AGENT, priority='cmdline')
    settings.set('ITEM_PIPELINES', ITEM_PIPELINES, priority='cmdline')
    settings.set('CONCURRENT_REQUESTS', CONCURRENT_REQUESTS, priority='cmdline')
    settings.set('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG, priority='cmdline')
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.signals.connect(finished_run, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()

    # log.start(loglevel=log.DEBUG)
    log.start(loglevel=log.INFO)
    log.msg("Starting run ...", spider=spider, level=log.INFO)
    reactor.run()