# -*- coding: utf-8 -*-
#
# Copyright: © 2014 "nyov"
#
# This script crawls a Redmine wiki website and writes the history
# of all pages found to a single branch inside a Git repository.
#
# The script will create a git repository in your working directory.
# It requires the scrapy (0.24) and pygit2 python packages.
# Aside from that, it needs enough memory to hold all the records in
# memory until the crawl ends, because it can sort them by date and
# version, and flush the git tree history to disk in the correct
# order, only at the very end.
#
# Created for importing from static html pages of a redmine wiki
# (so some workarounds exist in how the crawl runs, to catch missing
# pages), but it should work on, or be easily adaptable to, a live
# Redmine instance.
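#
# Typical use (the script filename here is illustrative, use whatever
# you saved this file as):
#
#   $ python redmine2git.py
#
# Afterwards the 'wiki.git' repository in the working directory holds
# one <pagename>.textile file per wiki page, with its full history.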

from scrapy import log
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.http import Request, HtmlResponse
from scrapy.selector import Selector
import scrapy
import scrapy.utils.misc

import calendar
import datetime
import heapq
import re
import urllib
import urlparse

import pygit2
#from dateutil.parser import parse

BOT_NAME = 'RedmineExporter'
BOT_VERSION = '1.0'  # any version string, used only in the User-Agent below

# how to identify to the target website
USER_AGENT = '%s/%s (+http://www.yourdomain.com)' % (BOT_NAME, BOT_VERSION)
# how many parallel connections to keep open to the target website
CONCURRENT_REQUESTS = 16
# show duplicate (dropped) requests
DUPEFILTER_DEBUG = False
# for the debugging log level, see the end of this file


def read_git_authors(file):
    """Read a git (git-svn) authors.txt file,

    which has the line format:
    handle = Full Name <and@some.email>
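
    For example (a hypothetical mapping):
    jdoe = John Doe <jdoe@example.org>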
    """
    authors = {}
    with open(file) as f:
        data = f.read().splitlines()
    data = (l for l in data if not l.startswith('#'))  # skip comments
    for line in data:
        name, handle = line.strip().split(' = ')
        author, email = handle.rstrip('>').split(' <')
        authors[name] = (author, email)
        #print('\t%s => "%s" [%s]' % (name, author, email))
    return authors


class RedmineUser(scrapy.Item):
    author = scrapy.Field()
    email = scrapy.Field()


class RedminePage(scrapy.Item):
    pagename = scrapy.Field()
    version = scrapy.Field()
    lastversion = scrapy.Field()
    updated = scrapy.Field()
    user = scrapy.Field()
    comment = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()


class RedmineExportSpider(scrapy.Spider):
    """Xonotic Redmine exporter"""
    name = 'redmineexport'
    allowed_domains = ['dev.xonotic.org']
    start_urls = [
        # wiki's 'Index by title' page
        'http://dev.xonotic.org/projects/xonotic/wiki/index.html',
        # this page does not appear in the index (it has been renamed),
        # so request it explicitly
        'http://dev.xonotic.org/projects/xonotic/wiki/IRC.html',
    ]

    def start_requests(self):
        for link in self.start_urls[:1]:  # the index page
            yield Request(url=link, callback=self.parse_index)
        for link in self.start_urls[1:]:  # any other links
            yield Request(url=link, callback=self.parse_pages)

    def parse_index(self, response):
        le = LinkExtractor(allow=(r'/wiki/.*\.html',), restrict_xpaths=('//div[@id="wrapper"]//div[@id="content"]',))
        for link in le.extract_links(response):
            yield Request(link.url, callback=self.parse_pages)
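
    # The `x, = seq[:1] or [None]` pattern in the callbacks below unpacks
    # the first xpath match if there is one, and falls back to None
    # instead of raising on an empty result.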
    def parse_pages(self, response):
        url, = response.xpath('//div[@id="wrapper"]//div[@id="content"]//a[contains(@class, "icon-history")]/@href').extract()[:1] or [None]
        return Request(urlparse.urljoin(response.url, url), callback=self.parse_history_entry)

    def parse_history_entry(self, response):
        page = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
        paginated, = page.xpath('.//span[@class="pagination"]/a[contains(text(), "Next")]/@href').extract()[:1] or [None]
        if paginated is not None:
            # re-entry, missing pages workaround
            full, = page.xpath('.//span[@class="pagination"]/a[last()]/@href').extract()
            return Request(urlparse.urljoin(response.url, full), callback=self.parse_history)
            # FIXME: missing recursion for more pages (200+ revisions)
        return self.parse_history(response)

    def parse_history(self, response):
        page = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
        history = page.xpath('.//form//table/tbody/tr')
        pagename = re.match(r'.*/wiki/(.*)/history', response.url).group(1)
        lastversion = page.xpath('.//form//table/tbody/tr[1]/td[1]/a/text()').extract()[0]
        for row in history:
            i = RedminePage()
            i['pagename'] = pagename
            i['version'], = row.xpath('td[@class="id"]/a/text()').extract()[:1] or [None]
            i['version'] = int(i['version'])
            i['lastversion'] = int(lastversion)
            date, = row.xpath('td[@class="updated_on"]/text()').extract()
            # date parse, assume UTC
            #i['updated'] = parse(date)
            i['updated'] = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M %p")
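            # (the strptime format above matches dates like "07/24/2014 10:31 PM")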
            i['user'], = row.xpath('td[@class="author"]/a[contains(@class, "user")]/text()').extract()[:1] or [None]
            userpage, = row.xpath('td[@class="author"]/a[contains(@class, "user")]/@href').extract()[:1] or [None]
            if userpage is not None:
                yield Request(urlparse.urljoin(response.url, userpage), callback=self.parse_user)
            i['comment'], = row.xpath('td[@class="comments"]/text()').extract()[:1] or [None]
            content, = row.xpath('td[@class="buttons"]//a[contains(@href, "annotate.html")]/@href').extract()[:1] or [None]
            request = Request(urlparse.urljoin(response.url, content), callback=self.parse_page)
            request.meta['item'] = i
            yield request

    def parse_user(self, response):
        i = RedmineUser()
        user = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
        i['author'], = user.xpath('h2/text()').extract()[:1] or [None]
        i['author'] = i['author'].strip()
        #i['email'], = user.xpath('div[@class="splitcontentleft"]/ul[1]/li/a[contains(@href, "mailto")]/text()').extract()[:1] or [None]
        i['email'], = user.xpath('div[@class="splitcontentleft"]/ul[1]/li/script/text()').re(r'.*\'(.*)\'')[:1] or [None]
        if i['email'] is None:
            # no email found; fall back to a dummy address
            i['email'] = '%s@' % i['author']
        else:
            # the address is hidden in a url-encoded document.write() call;
            # unquote it and pull the text out of the resulting anchor tag
            email = urllib.unquote(i['email']).lstrip('document.write(\'').rstrip('\');').decode('string_escape').replace('\\/', '/')
            fake = Selector(HtmlResponse(response.url, encoding='utf-8', body=email))
            i['email'], = fake.xpath('//a/text()').extract()[:1] or [None]
        return i

    def parse_page(self, response):
        i = response.meta['item']
        page = response.xpath('//div[@id="wrapper"]//div[@id="content"]')
        lines = page.xpath('table[contains(@class, "filecontent")]//tr/td[@class="line-code"]')  # keep empty lines!
        i['url'] = response.url
        i['content'] = u''
        for line in lines:
            line = (line.xpath('pre/text()').extract() or [u''])[0]
            i['content'] += line + '\n'
        return i


class GitImportPipeline(object):

    def __init__(self, *a, **kw):
        self.repo = pygit2.init_repository('wiki.git', False)  # non-bare repo
        self.heap = []  # heap for sorting commits
        self.committer = pygit2.Signature('RedmineExport', 'redmineexport@dev.xonotic.org', encoding='utf-8')

    def open_spider(self, spider):
        self.users = read_git_authors("redmine-authors.txt")

    def close_spider(self, spider):
        self.write_git(spider)

    def process_item(self, i, spider):
        if isinstance(i, RedmineUser):
            # prefer pre-loaded identities from the local file
            if i['author'] not in self.users:
                self.users[i['author']] = (i['author'], i['email'])
                log.msg("Scraped user %s" % (i['author'],), spider=spider, level=log.INFO)
        if isinstance(i, RedminePage):
            oid = self.repo.create_blob(i['content'].encode("utf8"))
            ts = calendar.timegm(i['updated'].utctimetuple())  # datetime to unix timestamp, for sorting
            heapq.heappush(self.heap, (ts, i['version'], oid, i))
            log.msg('Scraped page "%s" @ %s' % (i['pagename'], i['version']), spider=spider, level=log.INFO)
        return i
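
    # Tuples compare element-wise, so the heap yields revisions ordered by
    # timestamp first and page version second: (1406229060, 1, ...) pops
    # before (1406229060, 2, ...).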
    def write_git(self, spider):
        parent = parent_id = None
        for _ in range(len(self.heap)):
            (ts, vsn, oid, i) = heapq.heappop(self.heap)

            commit_comment = i['comment'] or u''
            add_comment = u'\n\n(Commit created by redmine exporter script from page "%s" version %s)' % (i['pagename'], i['version'])

            if parent is not None:
                tb = self.repo.TreeBuilder(parent.tree)  # treeish ~= filesystem folder
            else:
                tb = self.repo.TreeBuilder()

            filename = '%s%s' % (i['pagename'], '.textile')
            tb.insert(filename, oid, pygit2.GIT_FILEMODE_BLOB)
            tree = tb.write()  # create updated treeish with current page blob added

            parents = []
            if parent is not None:
                parents = [parent_id]

            (user, email) = self.users[i['user']]
            author = pygit2.Signature(user, email, time=ts, offset=0, encoding='utf-8')

            log.msg("Committing %s @ %s (%s)" % (i['pagename'], i['version'], oid), spider=spider, level=log.INFO)
            cid = self.repo.create_commit(
                'refs/heads/master',  # everything goes to a single branch
                author, self.committer, commit_comment + add_comment, tree, parents, 'utf-8'
            )
            # this commit is the new parent for the next commit
            parent = self.repo.get(cid)
            parent_id = cid


ITEM_PIPELINES = {  # HAXX: class object instead of a dotted path, see the load_object monkeypatch below
    GitImportPipeline: 800,
}
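
# (800 is a slot in scrapy's usual 0-1000 pipeline ordering range;
#  with a single pipeline any value works)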

# haxx: sad monkeypatch, might break
from importlib import import_module

def load_object(path):
    try:
        dot = path.rindex('.')
    except ValueError:
        raise ValueError("Error loading object '%s': not a full path" % path)
    except AttributeError:
        # no .rindex() method, so not a string: assume `path` already is
        # the object itself (as in ITEM_PIPELINES above) and pass it through
        return path
    module, name = path[:dot], path[dot+1:]
    mod = import_module(module)
    try:
        obj = getattr(mod, name)
    except AttributeError:
        raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
    return obj

scrapy.utils.misc.load_object = load_object

from scrapy.exceptions import DontCloseSpider

def finished_run(spider, reason):
    log.msg(u"""
┌───────────────────────────────────────┐
│ VERIFY IT REALLY FOUND ALL YOUR PAGES │
│ OR YOU WILL BE SORRY LATER            │
├───────────────────────────────────────┤
│ if it was successful, you now want to │
│ repack the dumped git object database:│
│                                       │
│ $ git reflog expire --expire=now --all│
│ $ git gc --prune=now                  │
│ $ git repack -A -d                    │
│ $ git gc --aggressive --prune=now     │
└───────────────────────────────────────┘
""", spider=spider, level=log.INFO)


if __name__ == "__main__":
    from twisted.internet import reactor
    from scrapy.utils.project import get_project_settings
    from scrapy.crawler import Crawler
    from scrapy import log, signals

    print(u"""
┌───────────────────────────────────────┐
│ Redmine Exporter script               │
├───────────────────────────────────────┤
│ handle with care,                     │
│ don't kill your webserver,            │
└───────────────────────────────────────┘
""")
    raw_input("Hit Enter to continue...")

    spider = RedmineExportSpider()
    settings = get_project_settings()
    settings.set('BOT_NAME', BOT_NAME, priority='cmdline')
    settings.set('USER_AGENT', USER_AGENT, priority='cmdline')
    settings.set('ITEM_PIPELINES', ITEM_PIPELINES, priority='cmdline')
    settings.set('CONCURRENT_REQUESTS', CONCURRENT_REQUESTS, priority='cmdline')
    settings.set('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG, priority='cmdline')
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.signals.connect(finished_run, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()

    # log.start(loglevel=log.DEBUG)
    log.start(loglevel=log.INFO)
    log.msg("Starting run ...", spider=spider, level=log.INFO)
    reactor.run()