Revision 29471

Added by Eri Katsari over 10 years ago

View differences:

modules/dnet-openaire-stats-workflow/src/main/resources/eu/dnetlib/iis/core/examples/javamapreduce/stats/oozie_app/workflow.xml
1
<workflow-app name="test-core_examples_javamapreduce_stats" xmlns="uri:oozie:workflow:0.4">
2
	<!-- Map-reduce job that exports HBase data and prepares it for import into the relational database used for statistics generation -->
3

  
4
	<global>
5
		<job-tracker>${jobTracker}</job-tracker>
6
		<name-node>${nameNode}</name-node>
7
		<configuration>
8
			<property>
9
				<name>mapred.job.queue.name</name>
10
				<value>${queueName}</value>
11
			</property>
12
			<property>
13
				<name>oozie.sqoop.log.level</name>
14
				<value>DEBUG</value>
15
			</property>
16
		</configuration>
17
	</global>
18
	<start to='get-scanner' />
19
	<action name='get-scanner'>
20
		<java>
21
			<main-class>eu.dnetlib.iis.core.workflows.stats.HbaseScannerGenerator
22
			</main-class>
23
			<!-- column families: -->
24

  
25
			<arg>
26
				-f datasource 
27
<!-- 				, -->
28
<!-- datasourceOrganization_provision_provides , -->
29
<!-- 				organization, -->
30
<!-- 				project, projectOrganization_participation_hasParticipant -->
31
				<!-- result,resultProject_outcome_produces, -->
32
				<!-- personResult_authorship_hasAuthor,resultResult_publicationDataset_isRelatedTo -->
33
			</arg>
34

  
35
			<capture-output />
36
		</java>
37
		<ok to="mr_export" />
38
		<error to="fail" />
39
	</action>
40
	<action name="mr_export">
41
		<map-reduce>
42

  
43
			<prepare>
44
				<delete path="${nameNode}${Stats_output_Path}" />
45

  
46
			</prepare>
47
			<configuration>
48
				<property>
49
					<name>hbase.mapreduce.scan</name>
50
					<value>${wf:actionData('get-scanner')['scan']}</value>
51
				</property>
52
				<property>
53
					<name>hbase.rootdir</name>
54
					<value>hdfs://nmis-hadoop-cluster/hbase</value>
55
				</property>
56

  
57
				<property>
58
					<name>hbase.security.authentication</name>
59
					<value>simple</value>
60
				</property>
61
				<!-- ZOOKEEPER -->
62

  
63
				<property>
64
					<name>hbase.zookeeper.quorum</name>
65
					<value>
66
						quorum1.t.hadoop.research-infrastructures.eu,quorum2.t.hadoop.research-infrastructures.eu,quorum3.t.hadoop.research-infrastructures.eu,quorum4.t.hadoop.research-infrastructures.eu,jobtracker.t.hadoop.research-infrastructures.eu
67
					</value>
68
				</property>
69
				<property>
70
					<name>zookeeper.znode.rootserver</name>
71
					<value>root-region-server</value>
72
				</property>
73

  
74
				<property>
75
					<name>hbase.zookeeper.property.clientPort</name>
76
					<value>2182</value>
77
				</property>
78

  
79

  
80
				<!-- MR IO -->
81

  
82

  
83
				<property>
84
					<name>mapreduce.inputformat.class</name>
85
					<value>org.apache.hadoop.hbase.mapreduce.TableInputFormat</value>
86
				</property>
87

  
88
				<property>
89
					<name>mapred.mapoutput.key.class</name>
90
					<value>org.apache.hadoop.io.Text</value>
91
				</property>
92
				<property>
93
					<name>mapred.mapoutput.value.class</name>
94
					<value>org.apache.hadoop.hbase.io.ImmutableBytesWritable</value>
95
				</property>
96
				<property>
97
					<name>mapred.output.key.class</name>
98
					<value>org.apache.hadoop.io.Text</value>
99
				</property>
100
				<property>
101
					<name>mapred.output.value.class</name>
102
					<value>org.apache.hadoop.io.Text
103
					</value>
104
				</property>
105

  
106
				<!-- ## This is required for new MapReduce API usage -->
107
				<property>
108
					<name>mapred.mapper.new-api</name>
109
					<value>true</value>
110
				</property>
111
				<property>
112
					<name>mapred.reducer.new-api</name>
113
					<value>true</value>
114
				</property>
115

  
116
				<!-- # Job-specific options -->
117
				<property>
118
					<name>dfs.blocksize</name>
119
					<value>32M</value>
120
				</property>
121
				<property>
122
					<name>mapred.output.compress</name>
123
					<value>false</value>
124
				</property>
125
				<property>
126
					<name>mapred.reduce.tasks.speculative.execution</name>
127
					<value>false</value>
128
				</property>
133
				<property>
134
					<name>mapreduce.map.speculative</name>
135
					<value>false</value>
136
				</property>
137

  
138
				<!-- I/O FORMAT -->
139
				<!-- IMPORTANT: sets the default delimiter used by the text output writer. Required to fix an issue with a trailing tab added between id and value in multiple outputs -->
140
				<property>
141
					<name>mapred.textoutputformat.separator</name>
142
					<value>${Stats_delim_Character}</value>
143
				</property>
144
				<!-- ## Names of all output ports -->
145

  
146
				<property>
147
					<name>mapreduce.multipleoutputs</name>
148

  
149
					<value>${out1} ${out2} ${out3} ${out4} ${out5} ${out6} ${out7} ${out8} ${out9} ${out10} ${out11} ${out12} ${out13} ${out14} ${out15} ${out16} ${out17} ${out18} ${out19} ${out20}
150
					</value>
151

  
152
				</property>
153
				<!-- datasource -->
154
				<property>
155
					<name>mapreduce.multipleoutputs.namedOutput.${out1}.key</name>
156
					<value>org.apache.hadoop.io.Text</value>
157
				</property>
158
				<property>
159
					<name>mapreduce.multipleoutputs.namedOutput.${out1}.value</name>
160
					<value>org.apache.hadoop.io.Text</value>
161
				</property>
162
				<property>
163
					<name>mapreduce.multipleoutputs.namedOutput.${out1}.format</name>
164
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
165
					</value>
166
				</property>
167
				<!-- project -->
168
				<property>
169
					<name>mapreduce.multipleoutputs.namedOutput.${out2}.key</name>
170
					<value>org.apache.hadoop.io.Text</value>
171
				</property>
172
				<property>
173
					<name>mapreduce.multipleoutputs.namedOutput.${out2}.value</name>
174
					<value>org.apache.hadoop.io.Text</value>
175
				</property>
176
				<property>
177
					<name>mapreduce.multipleoutputs.namedOutput.${out2}.format</name>
178
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
179
					</value>
180
				</property>
181

  
182

  
183

  
184
				<!-- organization -->
185
				<property>
186
					<name>mapreduce.multipleoutputs.namedOutput.${out3}.key</name>
187
					<value>org.apache.hadoop.io.Text</value>
188
				</property>
189
				<property>
190
					<name>mapreduce.multipleoutputs.namedOutput.${out3}.value</name>
191
					<value>org.apache.hadoop.io.Text</value>
192
				</property>
193
				<property>
194
					<name>mapreduce.multipleoutputs.namedOutput.${out3}.format</name>
195
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
196
					</value>
197
				</property>
198

  
199
				<!-- datasourceOrganization -->
200
				<property>
201
					<name>mapreduce.multipleoutputs.namedOutput.${out4}.key</name>
202
					<value>org.apache.hadoop.io.Text</value>
203
				</property>
204
				<property>
205
					<name>mapreduce.multipleoutputs.namedOutput.${out4}.value</name>
206
					<value>org.apache.hadoop.io.Text</value>
207
				</property>
208
				<property>
209
					<name>mapreduce.multipleoutputs.namedOutput.${out4}.format</name>
210
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
211
					</value>
212
				</property>
213

  
214
				<!-- datasourceTopic -->
215
				<property>
216
					<name>mapreduce.multipleoutputs.namedOutput.${out5}.key</name>
217
					<value>org.apache.hadoop.io.Text</value>
218
				</property>
219
				<property>
220
					<name>mapreduce.multipleoutputs.namedOutput.${out5}.value</name>
221
					<value>org.apache.hadoop.io.Text</value>
222
				</property>
223
				<property>
224
					<name>mapreduce.multipleoutputs.namedOutput.${out5}.format</name>
225
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
226
					</value>
227
				</property>
228
				<!-- datasourceLanguage -->
229
				<property>
230
					<name>mapreduce.multipleoutputs.namedOutput.${out6}.key</name>
231
					<value>org.apache.hadoop.io.Text</value>
232
				</property>
233
				<property>
234
					<name>mapreduce.multipleoutputs.namedOutput.${out6}.value</name>
235
					<value>org.apache.hadoop.io.Text</value>
236
				</property>
237
				<property>
238
					<name>mapreduce.multipleoutputs.namedOutput.${out6}.format</name>
239
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
240
					</value>
241
				</property>
242

  
243
				<!-- projectOrganization -->
244
				<property>
245
					<name>mapreduce.multipleoutputs.namedOutput.${out7}.key</name>
246
					<value>org.apache.hadoop.io.Text</value>
247
				</property>
248
				<property>
249
					<name>mapreduce.multipleoutputs.namedOutput.${out7}.value</name>
250
					<value>org.apache.hadoop.io.Text</value>
251
				</property>
252
				<property>
253
					<name>mapreduce.multipleoutputs.namedOutput.${out7}.format</name>
254
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
255
					</value>
256
				</property>
257
				<!-- resultClaim -->
258
				<property>
259
					<name>mapreduce.multipleoutputs.namedOutput.${out8}.key</name>
260
					<value>org.apache.hadoop.io.Text</value>
261
				</property>
262
				<property>
263
					<name>mapreduce.multipleoutputs.namedOutput.${out8}.value</name>
264
					<value>org.apache.hadoop.io.Text</value>
265
				</property>
266
				<property>
267
					<name>mapreduce.multipleoutputs.namedOutput.${out8}.format</name>
268
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
269
					</value>
270
				</property>
271

  
272
				<!-- resultClassification -->
273
				<property>
274
					<name>mapreduce.multipleoutputs.namedOutput.${out9}.key</name>
275
					<value>org.apache.hadoop.io.Text</value>
276
				</property>
277
				<property>
278
					<name>mapreduce.multipleoutputs.namedOutput.${out9}.value</name>
279
					<value>org.apache.hadoop.io.Text</value>
280
				</property>
281
				<property>
282
					<name>mapreduce.multipleoutputs.namedOutput.${out9}.format</name>
283
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
284
					</value>
285
				</property>
286

  
287
				<!-- resultConcept -->
288
				<property>
289
					<name>mapreduce.multipleoutputs.namedOutput.${out10}.key</name>
290
					<value>org.apache.hadoop.io.Text</value>
291
				</property>
292
				<property>
293
					<name>mapreduce.multipleoutputs.namedOutput.${out10}.value</name>
294
					<value>org.apache.hadoop.io.Text</value>
295
				</property>
296
				<property>
297
					<name>mapreduce.multipleoutputs.namedOutput.${out10}.format</name>
298
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
299
					</value>
300
				</property>
301

  
302
				<!-- resultLanguage -->
303
				<property>
304
					<name>mapreduce.multipleoutputs.namedOutput.${out11}.key</name>
305
					<value>org.apache.hadoop.io.Text</value>
306
				</property>
307
				<property>
308
					<name>mapreduce.multipleoutputs.namedOutput.${out11}.value</name>
309
					<value>org.apache.hadoop.io.Text</value>
310
				</property>
311
				<property>
312
					<name>mapreduce.multipleoutputs.namedOutput.${out11}.format</name>
313
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
314
					</value>
315
				</property>
316

  
317
				<!-- resultOrganization -->
318
				<property>
319
					<name>mapreduce.multipleoutputs.namedOutput.${out12}.key</name>
320
					<value>org.apache.hadoop.io.Text</value>
321
				</property>
322
				<property>
323
					<name>mapreduce.multipleoutputs.namedOutput.${out12}.value</name>
324
					<value>org.apache.hadoop.io.Text</value>
325
				</property>
326
				<property>
327
					<name>mapreduce.multipleoutputs.namedOutput.${out12}.format</name>
328
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
329
					</value>
330
				</property>
331

  
332
				<!-- resultResult -->
333
				<property>
334
					<name>mapreduce.multipleoutputs.namedOutput.${out13}.key</name>
335
					<value>org.apache.hadoop.io.Text</value>
336
				</property>
337
				<property>
338
					<name>mapreduce.multipleoutputs.namedOutput.${out13}.value</name>
339
					<value>org.apache.hadoop.io.Text</value>
340
				</property>
341
				<property>
342
					<name>mapreduce.multipleoutputs.namedOutput.${out13}.format</name>
343
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
344
					</value>
345
				</property>
346

  
347
				<!-- resultProject -->
348
				<property>
349
					<name>mapreduce.multipleoutputs.namedOutput.${out14}.key</name>
350
					<value>org.apache.hadoop.io.Text</value>
351
				</property>
352
				<property>
353
					<name>mapreduce.multipleoutputs.namedOutput.${out14}.value</name>
354
					<value>org.apache.hadoop.io.Text</value>
355
				</property>
356
				<property>
357
					<name>mapreduce.multipleoutputs.namedOutput.${out14}.format</name>
358
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
359
					</value>
360
				</property>
361
				<!-- resultTopic -->
362
				<property>
363
					<name>mapreduce.multipleoutputs.namedOutput.${out15}.key</name>
364
					<value>org.apache.hadoop.io.Text</value>
365
				</property>
366
				<property>
367
					<name>mapreduce.multipleoutputs.namedOutput.${out15}.value</name>
368
					<value>org.apache.hadoop.io.Text</value>
369
				</property>
370
				<property>
371
					<name>mapreduce.multipleoutputs.namedOutput.${out15}.format</name>
372
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
373
					</value>
374
				</property>
375
				<!-- resultDatasource -->
376
				<property>
377
					<name>mapreduce.multipleoutputs.namedOutput.${out16}.key</name>
378
					<value>org.apache.hadoop.io.Text</value>
379
				</property>
380
				<property>
381
					<name>mapreduce.multipleoutputs.namedOutput.${out16}.value</name>
382
					<value>org.apache.hadoop.io.Text</value>
383
				</property>
384
				<property>
385
					<name>mapreduce.multipleoutputs.namedOutput.${out16}.format</name>
386
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
387
					</value>
388
				</property>
389

  
390
				<!-- result -->
391
				<property>
392
					<name>mapreduce.multipleoutputs.namedOutput.${out17}.key</name>
393
					<value>org.apache.hadoop.io.Text</value>
394
				</property>
395
				<property>
396
					<name>mapreduce.multipleoutputs.namedOutput.${out17}.value</name>
397
					<value>org.apache.hadoop.io.Text</value>
398
				</property>
399
				<property>
400
					<name>mapreduce.multipleoutputs.namedOutput.${out17}.format</name>
401
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
402
					</value>
403
				</property>
404

  
405

  
406

  
407
				<!-- claim -->
408
				<property>
409
					<name>mapreduce.multipleoutputs.namedOutput.${out18}.key</name>
410
					<value>org.apache.hadoop.io.Text</value>
411
				</property>
412
				<property>
413
					<name>mapreduce.multipleoutputs.namedOutput.${out18}.value</name>
414
					<value>org.apache.hadoop.io.Text</value>
415
				</property>
416
				<property>
417
					<name>mapreduce.multipleoutputs.namedOutput.${out18}.format</name>
418
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
419
					</value>
420
				</property>
421

  
422
				<!-- concept -->
423
				<property>
424
					<name>mapreduce.multipleoutputs.namedOutput.${out19}.key</name>
425
					<value>org.apache.hadoop.io.Text</value>
426
				</property>
427
				<property>
428
					<name>mapreduce.multipleoutputs.namedOutput.${out19}.value</name>
429
					<value>org.apache.hadoop.io.Text</value>
430
				</property>
431
				<property>
432
					<name>mapreduce.multipleoutputs.namedOutput.${out19}.format</name>
433
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
434
					</value>
435
				</property>
436
				<!-- category -->
437
				<property>
438
					<name>mapreduce.multipleoutputs.namedOutput.${out20}.key</name>
439
					<value>org.apache.hadoop.io.Text</value>
440
				</property>
441
				<property>
442
					<name>mapreduce.multipleoutputs.namedOutput.${out20}.value</name>
443
					<value>org.apache.hadoop.io.Text</value>
444
				</property>
445
				<property>
446
					<name>mapreduce.multipleoutputs.namedOutput.${out20}.format</name>
447
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
448
					</value>
449
				</property>
450

  
451

  
452
				<!-- ## Classes of mapper and reducer -->
453

  
454
				<property>
455
					<name>mapreduce.map.class</name>
456
					<value>eu.dnetlib.data.mapreduce.hbase.statsExport.StatsMapper
457
					</value>
458
				</property>
459
				<property>
460
					<name>mapreduce.reduce.class</name>
461
					<value>eu.dnetlib.data.mapreduce.hbase.statsExport.StatsReducer
462
					</value>
463
				</property>
464
				<property>
465
					<name>io.serializations</name>
466
					<value>org.apache.hadoop.io.serializer.WritableSerialization
467
					</value>
468
				</property>
469
				<!-- ## Custom config -->
470

  
471
				<!-- delimiter character used to separate fields in HDFS dump files -->
472
				<property>
473
					<name>stats.delim</name>
474
					<value>${Stats_delim_Character}</value>
475
				</property>
476
				<!--default string for Null String Values -->
477
				<property>
478
					<name>stats.nullString</name>
479
					<value>${Stats_null_String_Field}</value>
480
				</property>
481
				<!--default string for Null Numeric Values -->
482
				<property>
483
					<name>stats.nullNum</name>
484
					<value>${Stats_null_Numeric_Field}</value>
485
				</property>
486
				<property>
487
					<name>stats.enclChar</name>
488
					<value>${Stats_enclosing_Character}</value>
489
				</property>
490

  
491

  
492
				<!--source hbase table -->
493
				<property>
494
					<name>hbase.mapreduce.inputtable</name>
495
					<value>${Stats_Hbase_Source_Table}</value>
496
				</property>
497
				<property>
498
					<!-- mapping of protobuf entities to tables in the relational DB -->
499
					<name>stats.dbTablesMap</name>
500
					<value>${Stats_db_table_map}</value>
501
				</property>
502

  
503
				<!-- This directory does not correspond to a data store. In fact, this directory only contains multiple data stores. It has to be set to the name of the workflow node. -->
504
				<property>
505
					<name>mapred.output.dir</name>
506
					<value>${Stats_output_Path}</value>
507
				</property>
508
				<property>
509
					<name>stats.indexConf</name>
510
					<value>${Stats_indexConf}</value>
511
				</property>
512
				<!-- ## Workflow node parameters -->
513
				<property>
514
					<name>mapred.reduce.tasks</name>
515
					<value>${numReducers}</value>
516
				</property>
517
			</configuration>
518
		</map-reduce>
519
		<ok to="exportContext" />
520
		<error to="fail" />
521
	</action>
522

  
523

  
524
	<action name="exportContext">
525
		<java>
526

  
527
			<prepare>
528
			</prepare>
529
			<configuration>
530
				<property>
531
					<name>mapred.job.queue.name</name>
532
					<value>${queueName}</value>
533
				</property>
534
			</configuration>
535

  
536
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
537
			<arg>-SworkingDir=${workingDir}</arg>
538
			<arg>eu.dnetlib.iis.core.workflows.stats.ExportContextWrapper</arg>
539
			<arg>-PStats_output_Path=${Stats_output_Path}</arg>
540
			<arg>-PStats_ContextResourceXML=${ContextResourceXML}</arg>
541

  
542
		</java>
543
		<ok to="prepareDatabase" />
544
		<error to="fail" />
545
	</action>
546

  
547
	<action name="prepareDatabase">
548
		<java>
549

  
550
			<prepare>
551
			</prepare>
552
			<configuration>
553
				<property>
554
					<name>mapred.job.queue.name</name>
555
					<value>${queueName}</value>
556
				</property>
557
			</configuration>
558

  
559
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
560
			<arg>-SworkingDir=${workingDir}</arg>
561
			<arg>eu.dnetlib.iis.core.workflows.stats.DBInitWrapper</arg>
562

  
563
			<arg>-PStats_db_Url=${Stats_db_Url}</arg>
564
			<arg>-PStats_db_User=${Stats_db_User}</arg>
565
			<arg>-PStats_db_Pass=${Stats_db_Pass}</arg>
566
			<arg>-PStats_db_Driver=${Stats_db_Driver}</arg>
567

  
568
		</java>
569
		<ok to="sqoopImport" />
570
		<error to="fail" />
571
	</action>
572

  
573
	<action name="sqoopImport">
574
		<java>
575
			<prepare>
576
			</prepare>
577
			<configuration>
578
				<property>
579
					<name>mapred.job.queue.name</name>
580
					<value>${queueName}</value>
581
				</property>
582

  
583
				<property>
584
					<name>oozie.sqoop.log.level</name>
585
					<value>DEBUG</value>
586
				</property>
587

  
588
			</configuration>
589

  
590
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
591
			<arg>-SworkingDir=${workingDir}</arg>
592
			<arg>eu.dnetlib.iis.core.workflows.stats.SqoopWrapper</arg>
593

  
594
			<arg>-PStats_db_Url=${Stats_db_Url}</arg>
595
			<arg>-PStats_db_User=${Stats_db_User}</arg>
596
			<arg>-PStats_db_Pass=${Stats_db_Pass}</arg>
597

  
598
			<arg>-PStats_output_Path=${Stats_output_Path}</arg>
599
			<arg>-PStats_sqoop_RecsPerStatement=${Stats_sqoop_RecsPerStatement}
600
			</arg>
601
			<arg>-PStats_sqoop_ReducersCount=${Stats_sqoop_ReducersCount}</arg>
602
			<arg>-PStats_sqoop_StatementPerTrans=${Stats_sqoop_StatementPerTrans}
603
			</arg>
604
			<arg>-PStats_delim_Character=${Stats_delim_Character}</arg>
605
			<arg>-PStats_db_table_map=${Stats_db_table_map}</arg>
606
			<arg>-PStats_enclosing_Character=${Stats_enclosing_Character} </arg>
607

  
608
		</java>
609
		<ok to="finalizeDatabase" />
610
		<error to="fail" />
611
	</action>
612

  
613
	<action name="finalizeDatabase">
614
		<java>
615

  
616
			<prepare>
617
			</prepare>
618
			<configuration>
619
				<property>
620
					<name>mapred.job.queue.name</name>
621
					<value>${queueName}</value>
622
				</property>
623
			</configuration>
624

  
625
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
626
			<arg>-SworkingDir=${workingDir}</arg>
627
			<arg>eu.dnetlib.iis.core.workflows.stats.DBFinalizeWrapper</arg>
628
			<arg>-PStats_db_Url=${Stats_db_Url}</arg>
629
			<arg>-PStats_db_User=${Stats_db_User}</arg>
630
			<arg>-PStats_db_Pass=${Stats_db_Pass}</arg>
631
			<arg>-PStats_db_Driver=${Stats_db_Driver}</arg>
632
			 
633

  
634
		</java>
635
		<ok to="end" />
636
		<error to="fail" />
637
	</action>
638
	<kill name="fail">
639
		<message>
640
			Unfortunately, the process failed -- error message:
641
			[${wf:errorMessage(wf:lastErrorNode())}]
642
		</message>
643
	</kill>
644
	<end name="end" />
645
</workflow-app>
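
The mapreduce.multipleoutputs and mapreduce.multipleoutputs.namedOutput.*.{key,value,format} properties set in this workflow are effectively the configuration keys that MultipleOutputs.addNamedOutput() would write from a driver program, so declaring them in the XML replaces the usual driver-side setup. As a rough illustration only (the real StatsReducer in eu.dnetlib.data.mapreduce.hbase.statsExport is not part of this diff, and its internals are an assumption here), a reducer can route records to the configured named outputs like this:

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

// Hypothetical sketch, not the actual StatsReducer: shows how the named
// outputs declared via mapreduce.multipleoutputs are written to.
public class StatsReducerSketch extends Reducer<Text, Text, Text, Text> {
	private MultipleOutputs<Text, Text> mos;

	@Override
	protected void setup(Context context) {
		mos = new MultipleOutputs<Text, Text>(context);
	}

	@Override
	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		for (Text value : values) {
			// "datasource" must match one of the names listed in the
			// mapreduce.multipleoutputs property (e.g. the ${out1} value).
			mos.write("datasource", key, value);
		}
	}

	@Override
	protected void cleanup(Context context) throws IOException, InterruptedException {
		mos.close();
	}
}

Each mos.write() target then lands in its own file set under ${Stats_output_Path}, using the TextOutputFormat and the ${Stats_delim_Character} separator configured above.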
modules/dnet-openaire-stats-workflow/src/main/resources/eu/dnetlib/iis/core/examples/javamapreduce/stats/job.properties
1
oozieServiceLoc=http://oozie.t.hadoop.research-infrastructures.eu:11000/oozie 
2
nameNode=hdfs://nmis-hadoop-cluster 
3
jobTracker=nmis-hadoop-jt
4
queueName=default 
5
user.name=eri.katsari 
6
numReducers=1   
7
Stats_db_Url = jdbc:postgresql://duffy.di.uoa.gr:5432/test_stats
8
Stats_db_User = sqoop
9
Stats_db_Pass = sqoop
10
Stats_db_Driver = org.postgresql.Driver
11
Stats_sqoop_RecsPerStatement = 1000
12
Stats_sqoop_StatementPerTrans = 1000
13
Stats_sqoop_ReducersCount=4
14
Stats_Hbase_Source_Table=db_openaireplus_node6_t
15
Stats_output_Path=/tmp/test_stats/
16
Stats_null_String_Field=NULL
17
Stats_null_Numeric_Field=-1
18
Stats_delim_Character=!
19
Stats_enclosing_Character=#
20
Stats_db_table_map=datasourceLanguage=datasource_languages,datasource=datasource,project=project,result=result,organization=organization,datasourceOrganization=datasource_organizations,datasourceTopic=datasource_topics,projectOrganization=project_organizations,resultClaim=result_claims,resultClassification=result_classifications,resultConcept=result_concepts,resultLanguage=result_languages,resultOrganization=result_organizations,resultResult=result_results,resultProject=project_results,resultTopic=result_topics,category=category,claim=claim,concept=concept,resultDatasource=result_datasources
21
out1=datasource 
22
out2=project 
23
out3=organization 
24
out4=datasourceOrganization 
25
out5=datasourceTopic 
26
out6=datasourceLanguage 
27
out7=projectOrganization 
28
out8=resultClaim 
29
out9=resultClassification 
30
out10=resultConcept 
31
out11=resultLanguage 
32
out12=resultOrganization 
33
out13=resultResult 
34
out14=resultProject 
35
out15=resultTopic 
36
out16=resultDatasource 
37
out17=result
38
out18=claim 
39
out19=concept
40
out20=category 
41
Stats_indexConf=index.conf { result { dups = true, links = [{ relType = personResult_authorship_hasAuthor, targetEntity = person, expandAs = rel, symmetric = true, fields = [title,dateofacceptance,publisher,resulttype] }, { relType = resultResult_dedup_isMergedIn, targetEntity = result, expandAs = child, symmetric = true, fields = [title,dateofacceptance,publisher,resulttype] }, { relType = resultResult_dedup_merges, targetEntity = result, expandAs = child, symmetric = true, fields = [title,dateofacceptance,publisher,resulttype] }, { relType = resultResult_publicationDataset_isRelatedTo, targetEntity = result, expandAs = rel, symmetric = true, fields = [title,dateofacceptance,publisher,resulttype] },{ relType = resultResult_similarity_isAmongTopNSimilarDocuments, targetEntity = result, expandAs = rel, symmetric = false, fields = [title,dateofacceptance,publisher,resulttype,similarity,type] },{ relType = resultResult_similarity_hasAmongTopNSimilarDocuments, targetEntity = result, expandAs = rel, symmetric = false, fields = [title,dateofacceptance,publisher,resulttype,similarity,type] }]}, person { dups = false, links = [ { relType = personResult_authorship_isAuthorOf, targetEntity = result, expandAs = rel, symmetric = true, fields = [fullname,ranking] }, { relType = projectPerson_contactPerson_isContact, targetEntity = project, expandAs = rel, symmetric = true, fields = [fullname,email,fax,phone] } ]}, datasource { dups = false, links = [ { relType = datasourceOrganization_provision_provides, targetEntity = organization, expandAs = rel, symmetric = true, fields = [officialname,websiteurl,datasourcetype,aggregatortype] } ]},organization { dups = false, links = [{ relType = projectOrganization_participation_isParticipant, targetEntity = project, expandAs = rel, symmetric = true, fields = [legalname,legalshortname,websiteurl,country] }, { relType = datasourceOrganization_provision_isProvidedBy, targetEntity = datasource, expandAs = rel, symmetric = true, fields = [legalname,legalshortname,websiteurl,country] } ]}, project { dups = false, links = [ { relType = projectOrganization_participation_hasParticipant, targetEntity = organization, expandAs = rel, symmetric = true, fields = [code,acronym,title,websiteurl,contracttype,fundingtree] }, { relType = resultProject_outcome_produces, targetEntity = result, expandAs = rel, symmetric = true, fields = [code,acronym,title,websiteurl,contracttype,fundingtree] }, { relType = projectPerson_contactPerson_hasContact, targetEntity = person, expandAs = rel, symmetric = true, fields = [code,acronym,title,websiteurl,contracttype,fundingtree] } ]}}
42
ContextResourceXML=<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="context.xsl"?><RESOURCE_PROFILE><HEADER><RESOURCE_IDENTIFIER value="1284f828-6c12-4905-a9c9-43f143b36e08_Q29udGV4dERTUmVzb3VyY2VzL0NvbnRleHREU1Jlc291cmNlVHlwZQ=="/><RESOURCE_TYPE value="ContextDSResourceType"/><RESOURCE_KIND value="ContextDSResources"/><RESOURCE_URI value=""/><DATE_OF_CREATION value=""/></HEADER><BODY><CONFIGURATION><context id="egi" type="community" label="EGI"><category claim="true" label="Scientific Disciplines" id="egi::classification"><concept label="Natural Sciences" claim="true" id="egi::classification::natsc"><param name="originalID">1</param><concept label="Mathematics" claim="true" id="egi::classification::natsc::math"><param name="originalID">1.1</param><concept label="Applied Mathematics" id="egi::classification::natsc::math::applied" claim="true"><param name="originalID">1.1.1</param></concept></concept></concept></category></context></CONFIGURATION><STATUS/><SECURITY_PARAMETERS/></BODY></RESOURCE_PROFILE>
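
Stats_db_table_map is a comma-separated list of entity=table pairs that reaches the export job through the stats.dbTablesMap property in the workflow. A minimal sketch of how such a value can be parsed into an entity-to-table map follows; the class below is hypothetical, since the actual consumer code is not shown in this diff:

import java.util.LinkedHashMap;
import java.util.Map;

// Hypothetical helper: parses "entity1=table1,entity2=table2,..." into a map,
// mirroring the Stats_db_table_map / stats.dbTablesMap value format above.
public final class TableMapParser {
	public static Map<String, String> parse(String tableMap) {
		Map<String, String> result = new LinkedHashMap<String, String>();
		for (String pair : tableMap.split(",")) {
			String[] kv = pair.split("=", 2);
			result.put(kv[0].trim(), kv[1].trim());
		}
		return result;
	}

	public static void main(String[] args) {
		// prints {datasourceLanguage=datasource_languages, datasource=datasource}
		System.out.println(parse("datasourceLanguage=datasource_languages,datasource=datasource"));
	}
}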
modules/dnet-openaire-stats-workflow/src/main/resources/eu/dnetlib/iis/core/examples/javamapreduce/stats/NewFile.xml
1
<?xml version="1.0" encoding="UTF-8"?>
modules/dnet-openaire-stats-workflow/src/main/resources/eu/dnetlib/iis/core/javamapreduce/stats/job.properties
1
oozieServiceLoc=http://oozie.t.hadoop.research-infrastructures.eu:11000/oozie 
2
nameNode=hdfs://nmis-hadoop-cluster 
3
jobTracker=nmis-hadoop-jt
4
queueName=default 
5
user.name=eri.katsari 
6
numReducers=1   
7
Stats_db_Url = jdbc:postgresql://duffy.di.uoa.gr:5432/test_stats
8
Stats_db_User = sqoop
9
Stats_db_Pass = sqoop
10
Stats_db_Driver = org.postgresql.Driver
11
Stats_sqoop_RecsPerStatement = 1000
12
Stats_sqoop_StatementPerTrans = 1000
13
Stats_sqoop_ReducersCount=4
14
Stats_Hbase_Source_Table=db_openaireplus_node6_t
15
Stats_output_Path=/tmp/test_stats/
16
Stats_null_String_Field=NULL
17
Stats_null_Numeric_Field=-1
18
Stats_delim_Character=!
19
Stats_enclosing_Character=#
20
Stats_db_table_map=datasourceLanguage=datasource_languages,datasource=datasource,project=project,result=result,organization=organization,datasourceOrganization=datasource_organizations,datasourceTopic=datasource_topics,projectOrganization=project_organizations,resultClaim=result_claims,resultClassification=result_classifications,resultConcept=result_concepts,resultLanguage=result_languages,resultOrganization=result_organizations,resultResult=result_results,resultProject=project_results,resultResult=result_results,resultTopic=result_topics,category=category,claim=claim,concept=concept,datasourceLanguage=datasource_languages,resultLanguage=result_languages,resultDatasource=result_datasources
21
out1=datasource 
22
out2=project 
23
out3=organization 
24
out4=datasourceOrganization 
25
out5=datasourceTopic 
26
out6=datasourceLanguage 
27
out7=projectOrganization 
28
out8=resultClaim 
29
out9=resultClassification 
30
out10=resultConcept 
31
out11=resultLanguage 
32
out12=resultOrganization 
33
out13=resultResult 
34
out14=resultProject 
35
out15=resultTopic 
36
out16=resultDatasource 
37
out17=result
38
out18=claim 
39
out19=concept
40
out20=category 
41
Stats_indexConf=index.conf { result { dups = true, links = [{ relType = personResult_authorship_hasAuthor, targetEntity = person, expandAs = rel, symmetric = true, fields = [title,dateofacceptance,publisher,resulttype] }, { relType = resultResult_dedup_isMergedIn, targetEntity = result, expandAs = child, symmetric = true, fields = [title,dateofacceptance,publisher,resulttype] }, { relType = resultResult_dedup_merges, targetEntity = result, expandAs = child, symmetric = true, fields = [title,dateofacceptance,publisher,resulttype] }, { relType = resultResult_publicationDataset_isRelatedTo, targetEntity = result, expandAs = rel, symmetric = true, fields = [title,dateofacceptance,publisher,resulttype] },{ relType = resultResult_similarity_isAmongTopNSimilarDocuments, targetEntity = result, expandAs = rel, symmetric = false, fields = [title,dateofacceptance,publisher,resulttype,similarity,type] },{ relType = resultResult_similarity_hasAmongTopNSimilarDocuments, targetEntity = result, expandAs = rel, symmetric = false, fields = [title,dateofacceptance,publisher,resulttype,similarity,type] }]}, person { dups = false, links = [ { relType = personResult_authorship_isAuthorOf, targetEntity = result, expandAs = rel, symmetric = true, fields = [fullname,ranking] }, { relType = projectPerson_contactPerson_isContact, targetEntity = project, expandAs = rel, symmetric = true, fields = [fullname,email,fax,phone] } ]}, datasource { dups = false, links = [ { relType = datasourceOrganization_provision_provides, targetEntity = organization, expandAs = rel, symmetric = true, fields = [officialname,websiteurl,datasourcetype,aggregatortype] } ]},organization { dups = false, links = [{ relType = projectOrganization_participation_isParticipant, targetEntity = project, expandAs = rel, symmetric = true, fields = [legalname,legalshortname,websiteurl,country] }, { relType = datasourceOrganization_provision_isProvidedBy, targetEntity = datasource, expandAs = rel, symmetric = true, fields = [legalname,legalshortname,websiteurl,country] } ]}, project { dups = false, links = [ { relType = projectOrganization_participation_hasParticipant, targetEntity = organization, expandAs = rel, symmetric = true, fields = [code,acronym,title,websiteurl,contracttype,fundingtree] }, { relType = resultProject_outcome_produces, targetEntity = result, expandAs = rel, symmetric = true, fields = [code,acronym,title,websiteurl,contracttype,fundingtree] }, { relType = projectPerson_contactPerson_hasContact, targetEntity = person, expandAs = rel, symmetric = true, fields = [code,acronym,title,websiteurl,contracttype,fundingtree] } ]}}
42
ContextResourceXML=<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="context.xsl"?><RESOURCE_PROFILES><RESOURCE_PROFILE><HEADER><RESOURCE_IDENTIFIER value="1284f828-6c12-4905-a9c9-43f143b36e08_Q29udGV4dERTUmVzb3VyY2VzL0NvbnRleHREU1Jlc291cmNlVHlwZQ=="/><RESOURCE_TYPE value="ContextDSResourceType"/><RESOURCE_KIND value="ContextDSResources"/><RESOURCE_URI value=""/><DATE_OF_CREATION value=""/></HEADER><BODY><CONFIGURATION><context id="egi" type="community" label="EGI"><category claim="true" label="Scientific Disciplines" id="egi::classification"><concept label="Natural Sciences" claim="true" id="egi::classification::natsc"><param name="originalID">1</param><concept label="Mathematics" claim="true" id="egi::classification::natsc::math"><param name="originalID">1.1</param><concept label="Applied Mathematics" id="egi::classification::natsc::math::applied" claim="true"><param name="originalID">1.1.1</param></concept></concept></concept></category></context></CONFIGURATION><STATUS/><SECURITY_PARAMETERS/></BODY></RESOURCE_PROFILE></RESOURCE_PROFILES>
modules/dnet-openaire-stats-workflow/src/main/resources/eu/dnetlib/iis/core/javamapreduce/stats/oozie_app/workflow.xml
1
<workflow-app name="test-core_examples_javamapreduce_stats" xmlns="uri:oozie:workflow:0.4">
2
	<!-- Map-reduce job that exports HBase data and prepares it for import into the relational database used for statistics generation -->
3

  
4
	<global>
5
		<job-tracker>${jobTracker}</job-tracker>
6
		<name-node>${nameNode}</name-node>
7
		<configuration>
8
			<property>
9
				<name>mapred.job.queue.name</name>
10
				<value>${queueName}</value>
11
			</property>
12
			<property>
13
				<name>oozie.sqoop.log.level</name>
14
				<value>DEBUG</value>
15
			</property>
16
		</configuration>
17
	</global>
18
	<start to='get-scanner' />
19
	<action name='get-scanner'>
20
		<java>
21
			<main-class>eu.dnetlib.iis.core.workflows.stats.HbaseScannerGenerator
22
			</main-class>
23
			<!-- column families: -->
24

  
25
			<arg>
26
				-f datasource 
27
<!-- 				, -->
28
<!-- datasourceOrganization_provision_provides , -->
29
<!-- 				organization, -->
30
<!-- 				project, projectOrganization_participation_hasParticipant -->
31
				<!-- result,resultProject_outcome_produces, -->
32
				<!-- personResult_authorship_hasAuthor,resultResult_publicationDataset_isRelatedTo -->
33
			</arg>
34

  
35
			<capture-output />
36
		</java>
37
		<ok to="mr_export" />
38
		<error to="fail" />
39
	</action>
40
	<action name="mr_export">
41
		<map-reduce>
42

  
43
			<prepare>
44
				<delete path="${nameNode}${Stats_output_Path}" />
45

  
46
			</prepare>
47
			<configuration>
48
				<property>
49
					<name>hbase.mapreduce.scan</name>
50
					<value>${wf:actionData('get-scanner')['scan']}</value>
51
				</property>
52
				<property>
53
					<name>hbase.rootdir</name>
54
					<value>hdfs://nmis-hadoop-cluster/hbase</value>
55
				</property>
56

  
57
				<property>
58
					<name>hbase.security.authentication</name>
59
					<value>simple</value>
60
				</property>
61
				<!-- ZOOKEEPER -->
62

  
63
				<property>
64
					<name>hbase.zookeeper.quorum</name>
65
					<value>
66
						quorum1.t.hadoop.research-infrastructures.eu,quorum2.t.hadoop.research-infrastructures.eu,quorum3.t.hadoop.research-infrastructures.eu,quorum4.t.hadoop.research-infrastructures.eu,jobtracker.t.hadoop.research-infrastructures.eu
67
					</value>
68
				</property>
69
				<property>
70
					<name>zookeeper.znode.rootserver</name>
71
					<value>root-region-server</value>
72
				</property>
73

  
74
				<property>
75
					<name>hbase.zookeeper.property.clientPort</name>
76
					<value>2182</value>
77
				</property>
78

  
79

  
80
				<!-- MR IO -->
81

  
82

  
83
				<property>
84
					<name>mapreduce.inputformat.class</name>
85
					<value>org.apache.hadoop.hbase.mapreduce.TableInputFormat</value>
86
				</property>
87

  
88
				<property>
89
					<name>mapred.mapoutput.key.class</name>
90
					<value>org.apache.hadoop.io.Text</value>
91
				</property>
92
				<property>
93
					<name>mapred.mapoutput.value.class</name>
94
					<value>org.apache.hadoop.hbase.io.ImmutableBytesWritable</value>
95
				</property>
96
				<property>
97
					<name>mapred.output.key.class</name>
98
					<value>org.apache.hadoop.io.Text</value>
99
				</property>
100
				<property>
101
					<name>mapred.output.value.class</name>
102
					<value>org.apache.hadoop.io.Text
103
					</value>
104
				</property>
105

  
106
				<!-- ## This is required for new MapReduce API usage -->
107
				<property>
108
					<name>mapred.mapper.new-api</name>
109
					<value>true</value>
110
				</property>
111
				<property>
112
					<name>mapred.reducer.new-api</name>
113
					<value>true</value>
114
				</property>
115

  
116
				<!-- # Job-specific options -->
117
				<property>
118
					<name>dfs.blocksize</name>
119
					<value>32M</value>
120
				</property>
121
				<property>
122
					<name>mapred.output.compress</name>
123
					<value>false</value>
124
				</property>
125
				<property>
126
					<name>mapred.reduce.tasks.speculative.execution</name>
127
					<value>false</value>
128
				</property>
133
				<property>
134
					<name>mapreduce.map.speculative</name>
135
					<value>false</value>
136
				</property>
137

  
138
				<!-- I/O FORMAT -->
139
				<!-- IMPORTANT: sets the default delimiter used by the text output writer. Required to fix an issue with a trailing tab added between id and value in multiple outputs -->
140
				<property>
141
					<name>mapred.textoutputformat.separator</name>
142
					<value>${Stats_delim_Character}</value>
143
				</property>
144
				<!-- ## Names of all output ports -->
145

  
146
				<property>
147
					<name>mapreduce.multipleoutputs</name>
148

  
149
					<value>${out1} ${out2} ${out3} ${out4} ${out5} ${out6} ${out7} ${out8} ${out9} ${out10} ${out11} ${out12} ${out13} ${out14} ${out15} ${out16} ${out17} ${out18} ${out19} ${out20}
150
					</value>
151

  
152
				</property>
153
				<!-- datasource -->
154
				<property>
155
					<name>mapreduce.multipleoutputs.namedOutput.${out1}.key</name>
156
					<value>org.apache.hadoop.io.Text</value>
157
				</property>
158
				<property>
159
					<name>mapreduce.multipleoutputs.namedOutput.${out1}.value</name>
160
					<value>org.apache.hadoop.io.Text</value>
161
				</property>
162
				<property>
163
					<name>mapreduce.multipleoutputs.namedOutput.${out1}.format</name>
164
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
165
					</value>
166
				</property>
167
				<!-- project -->
168
				<property>
169
					<name>mapreduce.multipleoutputs.namedOutput.${out2}.key</name>
170
					<value>org.apache.hadoop.io.Text</value>
171
				</property>
172
				<property>
173
					<name>mapreduce.multipleoutputs.namedOutput.${out2}.value</name>
174
					<value>org.apache.hadoop.io.Text</value>
175
				</property>
176
				<property>
177
					<name>mapreduce.multipleoutputs.namedOutput.${out2}.format</name>
178
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
179
					</value>
180
				</property>
181

  
182

  
183

  
184
				<!-- organization -->
185
				<property>
186
					<name>mapreduce.multipleoutputs.namedOutput.${out3}.key</name>
187
					<value>org.apache.hadoop.io.Text</value>
188
				</property>
189
				<property>
190
					<name>mapreduce.multipleoutputs.namedOutput.${out3}.value</name>
191
					<value>org.apache.hadoop.io.Text</value>
192
				</property>
193
				<property>
194
					<name>mapreduce.multipleoutputs.namedOutput.${out3}.format</name>
195
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
196
					</value>
197
				</property>
198

  
199
				<!-- datasourceOrganization -->
200
				<property>
201
					<name>mapreduce.multipleoutputs.namedOutput.${out4}.key</name>
202
					<value>org.apache.hadoop.io.Text</value>
203
				</property>
204
				<property>
205
					<name>mapreduce.multipleoutputs.namedOutput.${out4}.value</name>
206
					<value>org.apache.hadoop.io.Text</value>
207
				</property>
208
				<property>
209
					<name>mapreduce.multipleoutputs.namedOutput.${out4}.format</name>
210
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
211
					</value>
212
				</property>
213

  
214
				<!-- datasourceTopic -->
215
				<property>
216
					<name>mapreduce.multipleoutputs.namedOutput.${out5}.key</name>
217
					<value>org.apache.hadoop.io.Text</value>
218
				</property>
219
				<property>
220
					<name>mapreduce.multipleoutputs.namedOutput.${out5}.value</name>
221
					<value>org.apache.hadoop.io.Text</value>
222
				</property>
223
				<property>
224
					<name>mapreduce.multipleoutputs.namedOutput.${out5}.format</name>
225
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
226
					</value>
227
				</property>
228
				<!-- datasourceLanguage -->
229
				<property>
230
					<name>mapreduce.multipleoutputs.namedOutput.${out6}.key</name>
231
					<value>org.apache.hadoop.io.Text</value>
232
				</property>
233
				<property>
234
					<name>mapreduce.multipleoutputs.namedOutput.${out6}.value</name>
235
					<value>org.apache.hadoop.io.Text</value>
236
				</property>
237
				<property>
238
					<name>mapreduce.multipleoutputs.namedOutput.${out6}.format</name>
239
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
240
					</value>
241
				</property>
242

  
243
				<!-- projectOrganization -->
244
				<property>
245
					<name>mapreduce.multipleoutputs.namedOutput.${out7}.key</name>
246
					<value>org.apache.hadoop.io.Text</value>
247
				</property>
248
				<property>
249
					<name>mapreduce.multipleoutputs.namedOutput.${out7}.value</name>
250
					<value>org.apache.hadoop.io.Text</value>
251
				</property>
252
				<property>
253
					<name>mapreduce.multipleoutputs.namedOutput.${out7}.format</name>
254
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
255
					</value>
256
				</property>
257
				<!-- resultClaim -->
258
				<property>
259
					<name>mapreduce.multipleoutputs.namedOutput.${out8}.key</name>
260
					<value>org.apache.hadoop.io.Text</value>
261
				</property>
262
				<property>
263
					<name>mapreduce.multipleoutputs.namedOutput.${out8}.value</name>
264
					<value>org.apache.hadoop.io.Text</value>
265
				</property>
266
				<property>
267
					<name>mapreduce.multipleoutputs.namedOutput.${out8}.format</name>
268
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
269
					</value>
270
				</property>
271

  
272
				<!-- resultClassification -->
273
				<property>
274
					<name>mapreduce.multipleoutputs.namedOutput.${out9}.key</name>
275
					<value>org.apache.hadoop.io.Text</value>
276
				</property>
277
				<property>
278
					<name>mapreduce.multipleoutputs.namedOutput.${out9}.value</name>
279
					<value>org.apache.hadoop.io.Text</value>
280
				</property>
281
				<property>
282
					<name>mapreduce.multipleoutputs.namedOutput.${out9}.format</name>
283
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
284
					</value>
285
				</property>
286

  
287
				<!-- resultConcept -->
288
				<property>
289
					<name>mapreduce.multipleoutputs.namedOutput.${out10}.key</name>
290
					<value>org.apache.hadoop.io.Text</value>
291
				</property>
292
				<property>
293
					<name>mapreduce.multipleoutputs.namedOutput.${out10}.value</name>
294
					<value>org.apache.hadoop.io.Text</value>
295
				</property>
296
				<property>
297
					<name>mapreduce.multipleoutputs.namedOutput.${out10}.format</name>
298
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
299
					</value>
300
				</property>
301

  
302
				<!-- resultLanguage -->
303
				<property>
304
					<name>mapreduce.multipleoutputs.namedOutput.${out11}.key</name>
305
					<value>org.apache.hadoop.io.Text</value>
306
				</property>
307
				<property>
308
					<name>mapreduce.multipleoutputs.namedOutput.${out11}.value</name>
309
					<value>org.apache.hadoop.io.Text</value>
310
				</property>
311
				<property>
312
					<name>mapreduce.multipleoutputs.namedOutput.${out11}.format</name>
313
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
314
					</value>
315
				</property>
316

  
317
				<!-- resultOrganization -->
318
				<property>
319
					<name>mapreduce.multipleoutputs.namedOutput.${out12}.key</name>
320
					<value>org.apache.hadoop.io.Text</value>
321
				</property>
322
				<property>
323
					<name>mapreduce.multipleoutputs.namedOutput.${out12}.value</name>
324
					<value>org.apache.hadoop.io.Text</value>
325
				</property>
326
				<property>
327
					<name>mapreduce.multipleoutputs.namedOutput.${out12}.format</name>
328
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
329
					</value>
330
				</property>
331

  
332
				<!-- resultResult -->
333
				<property>
334
					<name>mapreduce.multipleoutputs.namedOutput.${out13}.key</name>
335
					<value>org.apache.hadoop.io.Text</value>
336
				</property>
337
				<property>
338
					<name>mapreduce.multipleoutputs.namedOutput.${out13}.value</name>
339
					<value>org.apache.hadoop.io.Text</value>
340
				</property>
341
				<property>
342
					<name>mapreduce.multipleoutputs.namedOutput.${out13}.format</name>
343
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
344
					</value>
345
				</property>
346

  
347
				<!-- resultProject -->
348
				<property>
349
					<name>mapreduce.multipleoutputs.namedOutput.${out14}.key</name>
350
					<value>org.apache.hadoop.io.Text</value>
351
				</property>
352
				<property>
353
					<name>mapreduce.multipleoutputs.namedOutput.${out14}.value</name>
354
					<value>org.apache.hadoop.io.Text</value>
355
				</property>
356
				<property>
357
					<name>mapreduce.multipleoutputs.namedOutput.${out14}.format</name>
358
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
359
					</value>
360
				</property>
361
				<!-- resultTopic -->
362
				<property>
363
					<name>mapreduce.multipleoutputs.namedOutput.${out15}.key</name>
364
					<value>org.apache.hadoop.io.Text</value>
365
				</property>
366
				<property>
367
					<name>mapreduce.multipleoutputs.namedOutput.${out15}.value</name>
368
					<value>org.apache.hadoop.io.Text</value>
369
				</property>
370
				<property>
371
					<name>mapreduce.multipleoutputs.namedOutput.${out15}.format</name>
372
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
373
					</value>
374
				</property>
375
				<!-- resultDatasource -->
376
				<property>
377
					<name>mapreduce.multipleoutputs.namedOutput.${out16}.key</name>
378
					<value>org.apache.hadoop.io.Text</value>
379
				</property>
380
				<property>
381
					<name>mapreduce.multipleoutputs.namedOutput.${out16}.value</name>
382
					<value>org.apache.hadoop.io.Text</value>
383
				</property>
384
				<property>
385
					<name>mapreduce.multipleoutputs.namedOutput.${out16}.format</name>
386
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
387
					</value>
388
				</property>
389

  
390
				<!-- result -->
391
				<property>
392
					<name>mapreduce.multipleoutputs.namedOutput.${out17}.key</name>
393
					<value>org.apache.hadoop.io.Text</value>
394
				</property>
395
				<property>
396
					<name>mapreduce.multipleoutputs.namedOutput.${out17}.value</name>
397
					<value>org.apache.hadoop.io.Text</value>
398
				</property>
399
				<property>
400
					<name>mapreduce.multipleoutputs.namedOutput.${out17}.format</name>
401
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
402
					</value>
403
				</property>
404

  
405

  
406

  
407
				<!-- claim -->
408
				<property>
409
					<name>mapreduce.multipleoutputs.namedOutput.${out18}.key</name>
410
					<value>org.apache.hadoop.io.Text</value>
411
				</property>
412
				<property>
413
					<name>mapreduce.multipleoutputs.namedOutput.${out18}.value</name>
414
					<value>org.apache.hadoop.io.Text</value>
415
				</property>
416
				<property>
417
					<name>mapreduce.multipleoutputs.namedOutput.${out18}.format</name>
418
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
419
					</value>
420
				</property>
421

  
422
				<!-- concept -->
423
				<property>
424
					<name>mapreduce.multipleoutputs.namedOutput.${out19}.key</name>
425
					<value>org.apache.hadoop.io.Text</value>
426
				</property>
427
				<property>
428
					<name>mapreduce.multipleoutputs.namedOutput.${out19}.value</name>
429
					<value>org.apache.hadoop.io.Text</value>
430
				</property>
431
				<property>
432
					<name>mapreduce.multipleoutputs.namedOutput.${out19}.format</name>
433
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
434
					</value>
435
				</property>
436
				<!-- category -->
437
				<property>
438
					<name>mapreduce.multipleoutputs.namedOutput.${out20}.key</name>
439
					<value>org.apache.hadoop.io.Text</value>
440
				</property>
441
				<property>
442
					<name>mapreduce.multipleoutputs.namedOutput.${out20}.value</name>
443
					<value>org.apache.hadoop.io.Text</value>
444
				</property>
445
				<property>
446
					<name>mapreduce.multipleoutputs.namedOutput.${out20}.format</name>
447
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
448
					</value>
449
				</property>
450

  
451

  
452
				<!-- ## Classes of mapper and reducer -->
453

  
454
				<property>
455
					<name>mapreduce.map.class</name>
456
					<value>eu.dnetlib.data.mapreduce.hbase.statsExport.StatsMapper
457
					</value>
458
				</property>
459
				<property>
460
					<name>mapreduce.reduce.class</name>
461
					<value>eu.dnetlib.data.mapreduce.hbase.statsExport.StatsReducer
462
					</value>
463
				</property>
464
				<property>
465
					<name>io.serializations</name>
466
					<value>org.apache.hadoop.io.serializer.WritableSerialization
467
					</value>
468
				</property>
469
				<!-- ## Custom config -->
470

  
471
				<!-- delimiter character used to separate fields in HDFS dump files -->
472
				<property>
473
					<name>stats.delim</name>
474
					<value>${Stats_delim_Character}</value>
475
				</property>
476
				<!--default string for Null String Values -->
477
				<property>
478
					<name>stats.nullString</name>
479
					<value>${Stats_null_String_Field}</value>
480
				</property>
481
				<!--default string for Null Numeric Values -->
482
				<property>
483
					<name>stats.nullNum</name>
484
					<value>${Stats_null_Numeric_Field}</value>
485
				</property>
486
				<property>
487
					<name>stats.enclChar</name>
488
					<value>${Stats_enclosing_Character}</value>
489
				</property>
490

  
491

  
492
				<!--source hbase table -->
493
				<property>
494
					<name>hbase.mapreduce.inputtable</name>
495
					<value>${Stats_Hbase_Source_Table}</value>
496
				</property>
497
				<property>
498
					<!-- mapping of protobuf entities to tables in the relational DB -->
499
					<name>stats.dbTablesMap</name>
500
					<value>${Stats_db_table_map}</value>
501
				</property>
502

  
503
				<!-- This directory does not correspond to a data store. In fact, this directory only contains multiple data stores. It has to be set to the name of the workflow node. -->
504
				<property>
505
					<name>mapred.output.dir</name>
506
					<value>${Stats_output_Path}</value>
507
				</property>
508
				<property>
509
					<name>stats.indexConf</name>
510
					<value>${Stats_indexConf}</value>
511
				</property>
512
				<!-- ## Workflow node parameters -->
513
				<property>
514
					<name>mapred.reduce.tasks</name>
515
					<value>${numReducers}</value>
516
				</property>
517
			</configuration>
518
		</map-reduce>
519
		<ok to="exportContext" />
520
		<error to="fail" />
521
	</action>
522

  
523

  
524
	<action name="exportContext">
525
		<java>
526

  
527
			<prepare>
528
			</prepare>
529
			<configuration>
530
				<property>
531
					<name>mapred.job.queue.name</name>
532
					<value>${queueName}</value>
533
				</property>
534
			</configuration>
535

  
536
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
537
			<arg>-SworkingDir=${workingDir}</arg>
538
			<arg>eu.dnetlib.iis.core.workflows.stats.ExportContextWrapper</arg>
539
			<arg>-PStats_output_Path=${Stats_output_Path}</arg>
540
			<arg>-PStats_ContextResourceXML=${ContextResourceXML}</arg>
541

  
542
		</java>
543
		<ok to="prepareDatabase" />
544
		<error to="fail" />
545
	</action>
546

  
547
	<action name="prepareDatabase">
548
		<java>
549

  
550
			<prepare>
551
			</prepare>
552
			<configuration>
553
				<property>
554
					<name>mapred.job.queue.name</name>
555
					<value>${queueName}</value>
556
				</property>
557
			</configuration>
558

  
559
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
560
			<arg>-SworkingDir=${workingDir}</arg>
561
			<arg>eu.dnetlib.iis.core.workflows.stats.DBInitWrapper</arg>
562

  
563
			<arg>-PStats_db_Url=${Stats_db_Url}</arg>
564
			<arg>-PStats_db_User=${Stats_db_User}</arg>
565
			<arg>-PStats_db_Pass=${Stats_db_Pass}</arg>
566
			<arg>-PStats_db_Driver=${Stats_db_Driver}</arg>
567

  
568
		</java>
569
		<ok to="sqoopImport" />
570
		<error to="fail" />
571
	</action>
572

  
573
	<action name="sqoopImport">
574
		<java>
575
			<prepare>
576
			</prepare>
577
			<configuration>
578
				<property>
579
					<name>mapred.job.queue.name</name>
580
					<value>${queueName}</value>
581
				</property>
582

  
583
				<property>
584
					<name>oozie.sqoop.log.level</name>
585
					<value>DEBUG</value>
586
				</property>
587

  
588
			</configuration>
589

  
590
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
591
			<arg>-SworkingDir=${workingDir}</arg>
592
			<arg>eu.dnetlib.iis.core.workflows.stats.SqoopWrapper</arg>
593

  
594
			<arg>-PStats_db_Url=${Stats_db_Url}</arg>
595
			<arg>-PStats_db_User=${Stats_db_User}</arg>
596
			<arg>-PStats_db_Pass=${Stats_db_Pass}</arg>
597

  
598
			<arg>-PStats_output_Path=${Stats_output_Path}</arg>
599
			<arg>-PStats_sqoop_RecsPerStatement=${Stats_sqoop_RecsPerStatement}
600
			</arg>
601
			<arg>-PStats_sqoop_ReducersCount=${Stats_sqoop_ReducersCount}</arg>
602
			<arg>-PStats_sqoop_StatementPerTrans=${Stats_sqoop_StatementPerTrans}
603
			</arg>
604
			<arg>-PStats_delim_Character=${Stats_delim_Character}</arg>
605
			<arg>-PStats_db_table_map=${Stats_db_table_map}</arg>
606
			<arg>-PStats_enclosing_Character=${Stats_enclosing_Character} </arg>
607

  
608
		</java>
609
		<ok to="finalizeDatabase" />
610
		<error to="fail" />
611
	</action>
612

  
613
	<action name="finalizeDatabase">
614
		<java>
615

  
616
			<prepare>
617
			</prepare>
618
			<configuration>
619
				<property>
620
					<name>mapred.job.queue.name</name>
621
					<value>${queueName}</value>
622
				</property>
623
			</configuration>
624

  
625
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
626
			<arg>-SworkingDir=${workingDir}</arg>
627
			<arg>eu.dnetlib.iis.core.workflows.stats.DBFinalizeWrapper</arg>
628
			<arg>-PStats_db_Url=${Stats_db_Url}</arg>
629
			<arg>-PStats_db_User=${Stats_db_User}</arg>
630
			<arg>-PStats_db_Pass=${Stats_db_Pass}</arg>
631
			<arg>-PStats_db_Driver=${Stats_db_Driver}</arg>
632
			 
633

  
634
		</java>
635
		<ok to="end" />
636
		<error to="fail" />
637
	</action>
638
	<kill name="fail">
639
		<message>
640
			Unfortunately, the process failed -- error message:
641
			[${wf:errorMessage(wf:lastErrorNode())}]
642
		</message>
643
	</kill>
644
	<end name="end" />
645
</workflow-app>
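
The get-scanner action in these workflows relies on Oozie's <capture-output/> contract: a <java> action exposes values to ${wf:actionData('get-scanner')['scan']} by writing a Java properties file to the path Oozie supplies in the oozie.action.output.properties system property. How HbaseScannerGenerator actually builds and serializes the HBase scan is not shown in this diff; the sketch below only illustrates that output mechanism, with a placeholder value:

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.Properties;

// Illustrative only: emits a "scan" property the way an Oozie <java> action
// with <capture-output/> must, so that downstream actions can read it via
// ${wf:actionData('get-scanner')['scan']}.
public final class ScannerOutputSketch {
	public static void main(String[] args) throws Exception {
		Properties props = new Properties();
		// Placeholder: the real generator would put a serialized HBase Scan here.
		props.setProperty("scan", "BASE64_ENCODED_SCAN");
		File outputFile = new File(System.getProperty("oozie.action.output.properties"));
		OutputStream os = new FileOutputStream(outputFile);
		try {
			props.store(os, null);
		} finally {
			os.close();
		}
	}
}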
modules/dnet-openaire-stats-workflow/src/main/resources/eu/dnetlib/iis/core/examples/simpleTest/oozie_app/workflow.xml
1
<workflow-app name="test-core_examples_javamapreduce_stats" xmlns="uri:oozie:workflow:0.4">
2
	<!-- Map-reduce job that exports HBase data and prepares it for import into the relational database used for statistics generation -->
3

  
1
<workflow-app name="test " xmlns="uri:oozie:workflow:0.4">
2
	 
4 3
	<global>
5 4
		<job-tracker>${jobTracker}</job-tracker>
6 5
		<name-node>${nameNode}</name-node>
......
28 27
 
29 28
    </java>
30 29
  </action>
31
	
32
	
33
	
34
	<action name='get-scanner'>
35
		<java>
36
			<main-class>eu.dnetlib.iis.core.workflows.stats.HbaseScannerGenerator
37
			</main-class>
38
			<!-- column families: -->
39

  
40
			<arg>
41
				-f datasource
42
				<!-- , datasourceOrganization_provision_provides ,organization, -->
43
				<!-- projectOrganization_participation_isParticipant, -->
44
				<!-- project -->
45
				<!-- ,projectOrganization_participation_hasParticipant -->
46
				<!-- , -->
47
<!-- 				result -->
48
				<!-- , resultProject_outcome_produces, -->
49
				<!-- personResult_authorship_hasAuthor,resultResult_publicationDataset_isRelatedTo -->
50
			</arg>
51

  
52
			<capture-output />
53
		</java>
54
		<ok to="mr_export" />
55
		<error to="fail" />
56
	</action>
57
	<action name="mr_export">
58
		<map-reduce>
59

  
60
			<prepare>
61
				<delete path="${nameNode}${Stats_output_Path}" />
62

  
63
			</prepare>
64
			<configuration>
65
				<property>
66
					<name>hbase.mapreduce.scan</name>
67
					<value>${wf:actionData('get-scanner')['scan']}</value>
68
				</property>
69
				<property>
70
					<name>hbase.rootdir</name>
71
					<value>hdfs://nmis-hadoop-cluster/hbase</value>
72
				</property>
73

  
74
				<property>
75
					<name>hbase.security.authentication</name>
76
					<value>simple</value>
77
				</property>
78
				<!-- ZOOKEEPER -->
79

  
80
				<property>
81
					<name>hbase.zookeeper.quorum</name>
82
					<value>
83
						quorum1.t.hadoop.research-infrastructures.eu,quorum2.t.hadoop.research-infrastructures.eu,quorum3.t.hadoop.research-infrastructures.eu,quorum4.t.hadoop.research-infrastructures.eu,jobtracker.t.hadoop.research-infrastructures.eu
84
					</value>
85
				</property>
86
				<property>
87
					<name>zookeeper.znode.rootserver</name>
88
					<value>root-region-server</value>
89
				</property>
90

  
91
				<property>
92
					<name>hbase.zookeeper.property.clientPort</name>
93
					<value>2182</value>
94
				</property>
95

  
96

  
97
				<!-- MR IO -->
98

  
99

  
100
				<property>
101
					<name>mapreduce.inputformat.class</name>
102
					<value>org.apache.hadoop.hbase.mapreduce.TableInputFormat</value>
103
				</property>
104

  
105
				<property>
106
					<name>mapred.mapoutput.key.class</name>
107
					<value>org.apache.hadoop.io.Text</value>
108
				</property>
109
				<property>
110
					<name>mapred.mapoutput.value.class</name>
111
					<value>org.apache.hadoop.hbase.io.ImmutableBytesWritable</value>
... This diff was truncated because it exceeds the maximum size that can be displayed.
