#!/usr/bin/env python2
# -*- coding: utf-8 -*-
#
# Copyright: © 2014 "nyov"
# License:   Expat
#
# This script will crawl a Redmine wiki website and write the full history
# of all pages found to a single branch inside a Git repository.
#
# The script will create a git repository in your working directory.
# It requires the scrapy (0.24) and pygit2 python packages.
# Aside from that, it needs enough memory to hold all scraped records
# until it can sort them by date and version and flush the git tree
# history to disk in the correct order at the very end.
#
# Created for importing from the static HTML pages of a Redmine wiki
# (so some workarounds for missing pages exist in how the crawl runs),
# but it should work on, or be easily adaptable to, the real thing.
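#
# Rough usage sketch (adapt to your setup; the file names below are the
# ones this script actually uses):
#
#   $ python2 redmine-exporter.py     # prompts, then crawls and builds ./wiki.git
#   $ cd wiki.git && git log --stat   # inspect the imported history
#
# An optional "redmine-authors.txt" file in the working directory maps wiki
# user names to proper git identities (see read_git_authors() below).
# Each wiki page becomes a "<pagename>.textile" file, with one commit per
# page revision on refs/heads/master.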

import scrapy
from scrapy import log
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.http import Request, HtmlResponse
from scrapy.selector import Selector

import urlparse
import urllib
import re

import datetime
#from dateutil.parser import parse

# for git imports
import pygit2
import heapq
import calendar
import time

################
### SETTINGS ###
################

BOT_NAME = 'RedmineExporter'
BOT_VERSION = '1.0'
# how to identify to the target website
USER_AGENT = '%s/%s (+http://www.yourdomain.com)' % (BOT_NAME, BOT_VERSION)
# how many parallel connections to keep open to the target website
CONCURRENT_REQUESTS = 16

# show duplicate (dropped) requests
DUPEFILTER_DEBUG = False
# for debugging log level see end of file
################

def read_git_authors(file):
        """Read a git (git-svn) authors.txt file

        which has the line format:
        handle = Full Name <and@some.email>
        """
        authors = {}
        try:
                with open(file) as f:
                        data = f.readlines()
                        data = (l for l in data if not l.startswith('#'))
                        for line in data: # if not line.startswith('#'):
                                name, handle = line.strip().split(' = ')
                                author, email = handle.rstrip('>').split(' <')
                                authors[name] = (author, email)
                                #print('\t%s => "%s" [%s]' % (name, author, email))
        except IOError: pass
        return authors

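# For illustration (hypothetical names): an authors file line such as
#
#   someuser = Some Name <some@example.com>
#
# ends up in the returned dict as
#
#   {'someuser': ('Some Name', 'some@example.com')}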

class RedmineUser(scrapy.Item):
        author = scrapy.Field()
        email = scrapy.Field()


class RedminePage(scrapy.Item):
        pagename = scrapy.Field()
        version = scrapy.Field()
        lastversion = scrapy.Field()
        updated = scrapy.Field()
        user = scrapy.Field()
        comment = scrapy.Field()
        content = scrapy.Field()
        # debug
        url = scrapy.Field()

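# A single scraped history row becomes a RedminePage item roughly like
# (hypothetical values):
#
#   RedminePage(pagename='SomePage', version=3, lastversion=7,
#               updated=datetime.datetime(2014, 1, 2, 15, 4),
#               user='someuser', comment='fixed a typo',
#               content=u'h1. Some Page\n...', url='http://...')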

class RedmineExportSpider(scrapy.Spider):
        """Xonotic Redmine exporter"""

        name = BOT_NAME
        allowed_domains = ['dev.xonotic.org']
        start_urls = (
                # wiki's 'Index by title' page
                'http://dev.xonotic.org/projects/xonotic/wiki/index.html',
                # this page does not appear in the overview, wtf! I don't even...
                # oh, it's been renamed
                'http://dev.xonotic.org/projects/xonotic/wiki/IRC.html',
        )

        def start_requests(self):
                for link in self.start_urls[:1]: # index
                        yield Request(url=link, callback=self.parse_index)
                for link in self.start_urls[1:]: # any other links
                        yield Request(url=link, callback=self.parse_pages)

        def parse_index(self, response):
                l = LinkExtractor(allow=(r'/wiki/.*\.html'), restrict_xpaths=('//div[@id="wrapper"]//div[@id="content"]'))
                for link in l.extract_links(response):
                        yield Request(link.url, callback=self.parse_pages)

        def parse_pages(self, response):
                url, = response.xpath('//div[@id="wrapper"]//div[@id="content"]//a[contains(@class, "icon-history")]/@href').extract()[:1] or [None]
                return Request(urlparse.urljoin(response.url, url), callback=self.parse_history_entry)

        def parse_history_entry(self, response):
                page = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
                paginated, = page.xpath('.//span[@class="pagination"]/a[contains(text(), "Next")]/@href').extract()[:1] or [None]
                if paginated:
                        # re-entry, missing pages workaround
                        full, = page.xpath('.//span[@class="pagination"]/a[last()]/@href').extract()
                        return Request(urlparse.urljoin(response.url, full), callback=self.parse_history)
                        # missing recursion for more pages (200+ revisions)
                else:
                        return self.parse_history(response)

        def parse_history(self, response):
                page = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
                history = page.xpath('.//form//table/tbody/tr')
                pagename = re.match(r'.*/wiki/(.*)/history', response.url).group(1)
                lastversion = page.xpath('.//form//table/tbody/tr[1]/td[1]/a/text()').extract()[0]
                for row in history:
                        i = RedminePage()
                        i['pagename'] = pagename
                        i['version'], = row.xpath('td[@class="id"]/a/text()').extract()[:1] or [None]
                        i['version'] = int(i['version'])
                        i['lastversion'] = int(lastversion)
                        date, = row.xpath('td[@class="updated_on"]/text()').extract()
                        # date parse, assume UTC
                        #i['updated'] = parse(date)
                        i['updated'] = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M %p")
                        i['user'], = row.xpath('td[@class="author"]/a[contains(@class, "user")]/text()').extract()[:1] or [None]
                        userpage, = row.xpath('td[@class="author"]/a[contains(@class, "user")]/@href').extract()[:1] or [None]
                        if userpage is not None:
                                yield Request(urlparse.urljoin(response.url, userpage), callback=self.parse_user)
                        i['comment'], = row.xpath('td[@class="comments"]/text()').extract()[:1] or [None]
                        content, = row.xpath('td[@class="buttons"]//a[contains(@href, "annotate.html")]/@href').extract()[:1] or [None]
                        request = Request(urlparse.urljoin(response.url, content), callback=self.parse_page)
                        request.meta['item'] = i
                        yield request

        def parse_user(self, response):
                i = RedmineUser()
                user = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
                i['author'], = user.xpath('h2/text()').extract()[:1] or [None]
                i['author'] = i['author'].strip()
                #i['email'], = user.xpath('div[@class="splitcontentleft"]/ul[1]/li/a[contains(@href, "mailto")]/text()').extract()[:1] or [None]
                i['email'], = user.xpath('div[@class="splitcontentleft"]/ul[1]/li/script/text()').re(r'.*\'(.*)\'')[:1] or [None]
                if not i['email']:
                        i['email'] = '%s@' % i['author']
                else:
                        email = urllib.unquote(i['email']).lstrip('document.write(\'').rstrip('\');').decode('string_escape').replace('\\/', '/')
                        fake = Selector(HtmlResponse(response.url, encoding='utf-8', body=email))
                        i['email'], = fake.xpath('//a/text()').extract()[:1] or [None]
                return i
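        # Note on the e-mail handling above: the profile page hides the address
        # behind a small document.write() script. With a (made-up) script body like
        #   document.write('%3Ca href%3D%22mailto%3Auser%40example.com%22%3Euser%40example.com%3C\/a%3E');
        # the regex pulls out the quoted payload, urllib.unquote() turns it into
        #   <a href="mailto:user@example.com">user@example.com</a>
        # and the throwaway HtmlResponse/Selector extracts the link text,
        # leaving 'user@example.com' as the address.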

        def parse_page(self, response):
                i = response.meta['item']
                page = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
                lines = page.xpath('table[contains(@class, "filecontent")]//tr/td[@class="line-code"]') # keep empty lines!
                i['url'] = response.url
                i['content'] = ''
                for line in lines:
                        line = (line.xpath('pre/text()').extract() or [u''])[0]
                        i['content'] += line + '\n'

                return i



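# Every scraped RedminePage revision is pushed onto a heap keyed by
# (timestamp, version); when the spider closes, write_git() pops them back in
# that order and builds each commit's tree from the previous commit's tree
# plus the new page blob, so pages accumulate on refs/heads/master with one
# commit per wiki page revision.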
class GitImportPipeline(object):
        """Git dumper"""

        def __init__(self, *a, **kw):
                self.repo = pygit2.init_repository('wiki.git', False) # non-bare repo
                self.heap = [] # heap for sorting commits
                self.committer = pygit2.Signature('RedmineExport', 'redmineexport@dev.xonotic.org', encoding='utf-8')
                self.users = {}

        def open_spider(self, spider):
                self.users = read_git_authors("redmine-authors.txt")

        def close_spider(self, spider):
                self.write_git(spider)

        def process_item(self, i, spider):
                if isinstance(i, RedmineUser):
                        # prefer pre-loaded identities from local file
                        if i['author'] not in self.users:
                                self.users[i['author']] = (i['author'], i['email'])
                        log.msg("Scraped user %s" % (i['author'],), spider=spider, level=log.INFO)

                if isinstance(i, RedminePage):
                        oid = self.repo.create_blob(i['content'].encode("utf8"))
                        ts = calendar.timegm(i['updated'].utctimetuple()) # datetime to unix timestamp for sorting
                        heapq.heappush(self.heap, (ts, i['version'], oid, i))
                        log.msg('Scraped page "%s" @ %s' % (i['pagename'], i['version']), spider=spider, level=log.INFO)

                return i

        def write_git(self, spider):
                parent = parent_id = None
                for _ in range(len(self.heap)):
                        (ts, vsn, oid, i) = heapq.heappop(self.heap)

                        commit_comment = i['comment'] or u''
                        add_comment = u'\n\n(Commit created by redmine exporter script from page "%s" version %s)' % (i['pagename'], i['version'])

                        if parent:
                                tb = self.repo.TreeBuilder(parent.tree) # treeish ~= filesystem folder
                        else:
                                tb = self.repo.TreeBuilder()

                        filename = '%s%s' % (i['pagename'], '.textile')

                        tb.insert(filename, oid, pygit2.GIT_FILEMODE_BLOB)
                        tree = tb.write() # create updated treeish with current page blob added

                        parents = []
                        if parent is not None:
                                parents = [parent_id]

                        (user, email) = self.users[i['user']]
                        author = pygit2.Signature(user, email, time=ts, offset=0, encoding='utf-8')

                        log.msg("Committing %s @ %s (%s)" % (i['pagename'], i['version'], oid), spider=spider, level=log.INFO)
                        cid = self.repo.create_commit(
                                'refs/heads/master',
                                author, self.committer, commit_comment + add_comment, tree, parents, 'utf-8'
                        )
                        # commit is new parent for next commit
                        parent = self.repo.get(cid)
                        parent_id = cid


ITEM_PIPELINES = { # HAXX :D
        GitImportPipeline: 800,
}

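# Stock scrapy expects ITEM_PIPELINES keys to be dotted-path strings, e.g.
# {'myproject.pipelines.GitImportPipeline': 800}. Passing the class object
# directly only works because scrapy's load_object() is monkeypatched below.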
# haxx: sad monkeypatch, might break
from importlib import import_module
def load_object(path):
        try:
                dot = path.rindex('.')
        except ValueError:
                raise ValueError("Error loading object '%s': not a full path" % path)
        except AttributeError:
                return path # hax

        module, name = path[:dot], path[dot+1:]
        mod = import_module(module)

        try:
                obj = getattr(mod, name)
        except AttributeError:
                raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))

        return obj

import scrapy.utils.misc # make sure the module attribute exists before patching it
scrapy.utils.misc.load_object = load_object
# end haxx

from scrapy.exceptions import DontCloseSpider
def finished_run():
        log.msg("""
┌───────────────────────────────────────┐
│           finished run                │
│                                       │
│ VERIFY IT REALLY FOUND ALL YOUR PAGES │
│      OR YOU WILL BE SORRY LATER       │
│                                       │
│ if it was successful, you now want to │
│ repack the dumped git object database:│
│                                       │
│ $ git reflog expire --expire=now --all│
│ $ git gc --prune=now                  │
│ $ git repack -A -d                    │
│ $ git gc --aggressive --prune=now     │
└───────────────────────────────────────┘
        """, spider=spider, level=log.INFO)


if __name__ == "__main__":
        # for scrapy 0.24
        from twisted.internet import reactor
        from scrapy.utils.project import get_project_settings
        from scrapy.crawler import Crawler
        from scrapy import log, signals

        import sys

        print("""
        ┌───────────────────────────────────────┐
        │        Redmine Exporter script        │
        ├───────────────────────────────────────┤
        │  handle with care,                    │
        │        don't kill your webserver,     │
        │                             ...enjoy  │
        └───────────────────────────────────────┘
        """)
        raw_input("Hit Enter to continue...")

        spider = RedmineExportSpider()
        settings = get_project_settings()
        settings.set('BOT_NAME', BOT_NAME, priority='cmdline')
        settings.set('USER_AGENT', USER_AGENT, priority='cmdline')
        settings.set('ITEM_PIPELINES', ITEM_PIPELINES, priority='cmdline')
        settings.set('CONCURRENT_REQUESTS', CONCURRENT_REQUESTS, priority='cmdline')
        settings.set('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG, priority='cmdline')
        crawler = Crawler(settings)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(finished_run, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
#       log.start(loglevel=log.DEBUG)
        log.start(loglevel=log.INFO)
        log.msg("Starting run ...", spider=spider, level=log.INFO)
        reactor.run()