Revision 36372
Added by Claudio Atzori over 9 years ago
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/resources/eu/dnetlib/actionmanager/xslt/datacite2updateActions.xslt | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:dnet="eu.dnetlib.actionmanager.actions.infopackage.DataciteInfoPackageToHbaseXsltFunctions"
    xmlns:oaf="http://namespace.openaire.eu/oaf"
    xmlns:dri="http://www.driver-repository.eu/namespace/dri"
    xmlns:date="eu.dnetlib.miscutils.datetime.DateUtils"
    xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:exslt="http://exslt.org/common"
    extension-element-prefixes="exslt" exclude-result-prefixes="xsl dnet exslt oaf dr dri date">

  <!--
    Turns a DataCite record into an ACTIONS document containing a single "update" ACTION
    that targets the "result" column family. The ACTION body is the value returned by the
    dnet:oafDataCiteResultFromInfoPackage extension function for the fields extracted below.
    A record without a metadata/resource element produces an empty ACTIONS element.
  -->
  <xsl:output omit-xml-declaration="yes" indent="yes"/>

  <!-- Trust and provenance are forwarded verbatim to the extension function. -->
  <xsl:param name="trust" select="string('0.9')"/>
  <xsl:param name="provenance" select="string('UNKNOWN')"/>
  <!-- Not read by this stylesheet any more; kept declared so the parameter list is unchanged. -->
  <xsl:param name="namespaceprefix" select="string('datacite____')"/>

  <xsl:template match="/*">
    <xsl:variable name="dataInfo" select="/*[local-name() = 'record']/*[local-name() = 'about']/*[local-name() = 'datainfo']"/>
    <xsl:variable name="metadata" select="exslt:node-set(/*[local-name()='record']/*[local-name()='metadata']/*[local-name()='resource'])"/>

    <xsl:choose>
      <xsl:when test="count($metadata) = 0">
        <ACTIONS/>
      </xsl:when>
      <xsl:otherwise>
        <!-- Identifiers -->
        <xsl:variable name="originalId" select="//*[local-name() = 'identifier' and ./@identifierType='DOI']"/>
        <xsl:variable name="resultId" select="dnet:oafSimpleId('result', //dri:objIdentifier)"/>

        <!-- Descriptive fields, matched by local name regardless of the DataCite namespace version -->
        <xsl:variable name="titles" select="//*[local-name() = 'title']"/>
        <xsl:variable name="subjects" select="//*[local-name() = 'subject']"/>
        <xsl:variable name="publisher" select="//*[local-name() = 'publisher']"/>
        <xsl:variable name="descriptions" select="//*[local-name() = 'description']"/>
        <xsl:variable name="dates" select="//*[local-name() = 'date']"/>
        <xsl:variable name="dateaccepted" select="//oaf:dateAccepted"/>
        <xsl:variable name="resourceType" select="//*[local-name() = 'resourceType']"/>
        <xsl:variable name="formats" select="//*[local-name() = 'format']"/>
        <xsl:variable name="sizes" select="//*[local-name() = 'size']"/>
        <xsl:variable name="rights" select="//oaf:accessrights"/>
        <xsl:variable name="version" select="//*[local-name() = 'version']"/>
        <xsl:variable name="language" select="//oaf:language"/>
        <xsl:variable name="cobjcategory" select="//dr:CobjCategory"/>

        <!-- concat() uses the string value of the first resource/identifier element -->
        <xsl:variable name="instanceURI"
            select="concat('http://dx.doi.org','/',//*[local-name() = 'resource']/*[local-name() = 'identifier'])"/>

        <!-- Provenance of the record -->
        <xsl:variable name="hostedbyid"
            select="dnet:oafSplitId('datasource', //oaf:hostedBy/@id)"/>
        <xsl:variable name="hostedbyname" select="//oaf:hostedBy/@name"/>
        <xsl:variable name="collectedfromid"
            select="dnet:oafSplitId('datasource', //oaf:collectedFrom/@id)"/>
        <xsl:variable name="collectedfromname" select="//oaf:collectedFrom/@name"/>
        <xsl:variable name="dateOfCollection" select="//dr:dateOfCollection"/>

        <ACTIONS>
          <!-- The target column name embeds date:now(), so every run writes a distinct update column -->
          <ACTION targetKey="{$resultId}" targetColumnFamily="result" targetColumn="{concat('update_', date:now())}">
            <xsl:value-of
                select="dnet:oafDataCiteResultFromInfoPackage($resultId, $dataInfo, $metadata, $titles,
                    $subjects, $publisher, $descriptions, $dates, $dateaccepted, $resourceType,
                    $formats, $sizes, $language, $cobjcategory, $rights, $version, $provenance, $trust, $hostedbyid, $hostedbyname,
                    $collectedfromid, $collectedfromname, $originalId, $instanceURI, $dateOfCollection)"
            />
          </ACTION>
        </ACTIONS>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>
</xsl:stylesheet>
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/resources/eu/dnetlib/actionmanager/xslt/dmf2updateActions.xslt | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:dc="http://purl.org/dc/elements/1.1/"
    xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:dri="http://www.driver-repository.eu/namespace/dri"
    xmlns:oaa="http://namespace.openaire.eu/oaa" xmlns:oaf="http://namespace.openaire.eu/oaf"
    xmlns:dnet="eu.dnetlib.actionmanager.actions.infopackage.DMFInfoPackageToHbaseXsltFunctions"
    xmlns:date="eu.dnetlib.miscutils.datetime.DateUtils"
    xmlns:exslt="http://exslt.org/common" xmlns:action="http://namespace.openaire.eu/action"
    extension-element-prefixes="exslt"
    exclude-result-prefixes="xsl dc dr dri oaa oaf dnet exslt date">

  <!--
    Turns a DMF record into an ACTIONS document containing a single "update" ACTION that
    targets the "result" column family. The ACTION body is the value returned by the
    dnet:oafResultFromInfoPackage extension function. A record whose metadata element has
    no children produces an empty ACTIONS element.
  -->
  <xsl:output omit-xml-declaration="yes" indent="yes" />

  <xsl:param name="trust" select="string('0.9')" />
  <xsl:param name="provenance" select="string('UNKNOWN')" />
  <xsl:param name="namespaceprefix" select="string('unknown_____')" />

  <xsl:template match="/*">
    <xsl:variable name="dataInfo" select="/*[local-name() = 'record']/*[local-name() = 'about']/*[local-name() = 'datainfo']"/>
    <xsl:variable name="dateofcollection" select="//dr:dateOfCollection"/>

    <xsl:variable name="metadata" select="exslt:node-set(//*[local-name()='metadata']/*)" />

    <!-- NOTE(review): this variable is not referenced anywhere else in the stylesheet. -->
    <xsl:variable name="collectedDatasourceid">
      <xsl:choose>
        <xsl:when test="string-length(//oaf:collectedDatasourceid) > 0">
          <xsl:value-of select="//oaf:collectedDatasourceid" />
        </xsl:when>
        <xsl:otherwise>
          <!-- Quoted so that XPath yields the literal text; a bare UNKNOWN would select a child element of that name -->
          <xsl:value-of select="'UNKNOWN'" />
        </xsl:otherwise>
      </xsl:choose>
    </xsl:variable>

    <xsl:choose>
      <xsl:when test="count($metadata) = 0">
        <ACTIONS />
      </xsl:when>
      <xsl:otherwise>

        <!-- The root is matched by local name, like dataInfo above, so a namespaced record element is found too -->
        <xsl:variable name="objidentifier" select="/*[local-name() = 'record']/*[local-name() = 'header']/*[local-name() = 'objIdentifier']" />

        <xsl:variable name="resultId" select="dnet:oafSimpleId('result', $objidentifier)" />

        <xsl:variable name="hostedbyid" select="dnet:oafSplitId('datasource', //oaf:hostedBy/@id)" />
        <xsl:variable name="hostedbyname" select="//oaf:hostedBy/@name" />

        <xsl:variable name="collectedfromid" select="dnet:oafSplitId('datasource', //oaf:collectedFrom/@id)" />
        <xsl:variable name="collectedfromname" select="//oaf:collectedFrom/@name" />

        <!-- <xsl:variable name="country" select="substring(//dr:repositoryCountry, 1, 200)" /> -->
        <!-- <xsl:variable name="accessmode" select="substring(//oaf:accessrights, 1, 200)" /> -->

        <xsl:variable name="result" select="dnet:oafResultFromInfoPackage($resultId, $dataInfo, $provenance, $trust, $hostedbyid, $hostedbyname, $collectedfromid, $collectedfromname, $objidentifier, $dateofcollection, $metadata)" />

        <ACTIONS>
          <!-- The target column name embeds date:now(), so every run writes a distinct update column -->
          <ACTION targetKey="{$resultId}" targetColumnFamily="result" targetColumn="{concat('update_', date:now())}">
            <xsl:value-of select="$result" />
          </ACTION>
        </ACTIONS>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>

</xsl:stylesheet>
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombinerTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import org.junit.Before; |
|
4 |
import org.junit.Test; |
|
5 |
|
|
6 |
import eu.dnetlib.pace.AbstractProtoPaceTest; |
|
7 |
import eu.dnetlib.pace.config.Config; |
|
8 |
import eu.dnetlib.pace.config.Type; |
|
9 |
import eu.dnetlib.pace.model.FieldListImpl; |
|
10 |
import eu.dnetlib.pace.model.FieldValueImpl; |
|
11 |
import eu.dnetlib.pace.model.MapDocument; |
|
12 |
|
|
13 |
public class BlacklistAwareClusteringCombinerTest extends AbstractProtoPaceTest { |
|
14 |
|
|
15 |
private Config config; |
|
16 |
|
|
17 |
@Before |
|
18 |
public void setUp() { |
|
19 |
config = getResultFullConf(); |
|
20 |
} |
|
21 |
|
|
22 |
@Test |
|
23 |
public void testCombine() { |
|
24 |
MapDocument result = result(config, "A", "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission", "2013"); |
|
25 |
FieldListImpl fl = new FieldListImpl(); |
|
26 |
fl.add(new FieldValueImpl(Type.String, "desc", "hello world description pipeline")); |
|
27 |
|
|
28 |
result.getFieldMap().put("desc", fl); |
|
29 |
|
|
30 |
fl.clear(); |
|
31 |
fl.add(new FieldValueImpl(Type.String, "title", "lorem ipsum cabalie qwerty")); |
|
32 |
result.getFieldMap().get("title").add(fl); |
|
33 |
|
|
34 |
System.out.println(BlacklistAwareClusteringCombiner.filterAndCombine(result, config, config.blacklists())); |
|
35 |
} |
|
36 |
} |
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/java/eu/dnetlib/data/mapreduce/util/OafDecoderTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.util; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertFalse; |
|
4 |
import static org.junit.Assert.assertNotNull; |
|
5 |
|
|
6 |
import java.util.List; |
|
7 |
|
|
8 |
import org.junit.Test; |
|
9 |
|
|
10 |
import eu.dnetlib.data.proto.KindProtos.Kind; |
|
11 |
import eu.dnetlib.miscutils.functional.xml.IndentXmlString; |
|
12 |
|
|
13 |
public class OafDecoderTest { |
|
14 |
|
|
15 |
@Test |
|
16 |
public void testAsXml() { |
|
17 |
|
|
18 |
final OafDecoder decoder = OafTest.embed(OafTest.getResult("50|id_1"), Kind.entity); |
|
19 |
|
|
20 |
assertNotNull(decoder); |
|
21 |
|
|
22 |
assertNotNull(decoder.asXml()); |
|
23 |
|
|
24 |
System.out.println(IndentXmlString.apply(decoder.asXml())); |
|
25 |
|
|
26 |
} |
|
27 |
|
|
28 |
@Test |
|
29 |
public void testGetFieldValues() { |
|
30 |
final OafDecoder decoder = OafTest.embed(OafTest.getResult("50|id_1"), Kind.entity); |
|
31 |
|
|
32 |
final String path = "result/metadata/title/value"; |
|
33 |
final List<String> titles = decoder.decodeEntity().getFieldValues("title", path); |
|
34 |
|
|
35 |
assertNotNull(titles); |
|
36 |
assertFalse(titles.isEmpty()); |
|
37 |
} |
|
38 |
} |
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/resources/eu/dnetlib/data/transform/datacite_2_hbase.xsl | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:dc="http://purl.org/dc/elements/1.1/"
    xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:dri="http://www.driver-repository.eu/namespace/dri"
    xmlns:oaa="http://namespace.openaire.eu/oaa" xmlns:oaf="http://namespace.openaire.eu/oaf"
    xmlns:dnet="eu.dnetlib.data.transform.xml.DNetMdStoreDataCiteToHbaseXsltFunctions"
    xmlns:exslt="http://exslt.org/common" extension-element-prefixes="exslt"
    exclude-result-prefixes="xsl dc dr dri oaa oaf dnet exslt">

  <!--
    Turns a DataCite record into a ROWS document:
      * one "result" ROW for the record itself;
      * per creator, one "person" ROW plus the two authorship relation ROWs;
      * per projectid element, the two result/project outcome relation ROWs;
      * per relatedPublication element, the two publication/dataset relation ROWs.
    QUALIFIER bodies are the values returned by the dnet extension functions.
    An empty ROWS element is emitted when the record has no metadata or is flagged with
    oaf:skipRecord = true; nothing is emitted when the result id comes back empty.
  -->
  <xsl:output omit-xml-declaration="yes" indent="yes" />
  <xsl:template match="/*">
    <xsl:variable name="dataInfo" select="/*[local-name() = 'record']/*[local-name() = 'about']/*[local-name() = 'datainfo']"/>
    <xsl:variable name="metadata"
        select="exslt:node-set(//*[local-name()='metadata']/*)" />

    <!-- Prefix used to build person ids; falls back to a fixed placeholder -->
    <xsl:variable name="namespaceprefix">
      <xsl:choose>
        <!-- TODO check namespaceprefix length is 12 -->
        <xsl:when test="string-length(//oaf:datasourceprefix) > 0">
          <xsl:value-of select="//oaf:datasourceprefix" />
        </xsl:when>
        <xsl:otherwise>
          <!-- Quoted so that XPath yields the literal text; a bare name would select a child element and give an empty prefix -->
          <xsl:value-of select="'unknown_____'" />
        </xsl:otherwise>
      </xsl:choose>
    </xsl:variable>

    <xsl:choose>
      <xsl:when
          test="count($metadata) = 0 or normalize-space(//oaf:skipRecord)= 'true'">
        <ROWS />
      </xsl:when>
      <xsl:otherwise>

        <xsl:variable name="resultId"
            select="dnet:oafSimpleId('result', //dri:objIdentifier)" />

        <xsl:if test="string-length($resultId) > 0">
          <xsl:variable name="originalid"
              select="concat('', //*[local-name() = 'resource']/*[local-name()='identifier'])" />
          <xsl:variable name="creators" select="//*[local-name() = 'creator']" />
          <xsl:variable name="titles" select="//*[local-name() = 'title']" />
          <xsl:variable name="subjects" select="//*[local-name() = 'subject']" />
          <xsl:variable name="publisher" select="//*[local-name() = 'publisher']" />
          <xsl:variable name="descriptions" select="//*[local-name() = 'description']" />
          <xsl:variable name="dates" select="//*[local-name() = 'date']" />
          <xsl:variable name="dateaccepted" select="//oaf:dateAccepted" />
          <xsl:variable name="resourceType"
              select="//*[local-name() = 'resourceType']" />
          <xsl:variable name="formats" select="//*[local-name() = 'format']" />
          <xsl:variable name="sizes" select="//*[local-name() = 'size']" />
          <xsl:variable name="rights" select="//oaf:accessrights" />
          <xsl:variable name="version" select="//*[local-name() = 'version']" />
          <xsl:variable name="language" select="//oaf:language" />
          <xsl:variable name="cobjcategory" select="//dr:CobjCategory" />

          <!-- A DOI identifier is turned into a dx.doi.org URL; otherwise the URL identifier is used as is -->
          <xsl:variable name="instanceURI">
            <xsl:choose>
              <xsl:when
                  test="string-length( //*[local-name() = 'resource']/*[local-name()='identifier' and ./@identifierType='DOI']) > 0">
                <xsl:value-of
                    select="concat('http://dx.doi.org','/', //*[local-name() = 'resource']/*[local-name()='identifier' and ./@identifierType='DOI']/text())" />
              </xsl:when>
              <xsl:otherwise>
                <xsl:value-of
                    select="concat('', //*[local-name() = 'resource']/*[local-name()='identifier' and ./@identifierType='URL'])" />
              </xsl:otherwise>
            </xsl:choose>
          </xsl:variable>

          <!-- concat('', ...) forces these values to strings before they reach the extension functions -->
          <xsl:variable name="hostedbyid"
              select="dnet:oafSplitId('datasource', //oaf:hostedBy/@id)" />
          <xsl:variable name="hostedbyname" select="concat('', //oaf:hostedBy/@name)" />
          <xsl:variable name="collectedfromid"
              select="dnet:oafSplitId('datasource', //oaf:collectedFrom/@id)" />
          <xsl:variable name="collectedfromname"
              select="concat('', //oaf:collectedFrom/@name)" />
          <xsl:variable name="dateOfCollection"
              select="concat('', //dri:dateOfCollection)" />

          <xsl:variable name="result"
              select="dnet:oafResult_FromDatacite($resultId, $dataInfo, $metadata, $titles,
                  $subjects, $publisher, $descriptions, $dates, $dateaccepted, $resourceType,
                  $formats, $sizes, $language, $cobjcategory, $rights, $version, $hostedbyid, $hostedbyname,
                  $collectedfromid, $collectedfromname, $originalid, $instanceURI, $dateOfCollection)" />

          <ROWS>
            <ROW key="{$resultId}" columnFamily="result">
              <QUALIFIER name="body" type="base64">
                <xsl:value-of select="$result" />
              </QUALIFIER>
            </ROW>

            <!-- Persons and authorship relations, one group per creator in document order -->
            <xsl:for-each select="$creators">
              <!-- The person id seed is the nameIdentifier when present, else the creatorName; spaces, dots and commas become underscores -->
              <xsl:variable name="personIdTemp">
                <xsl:choose>
                  <xsl:when
                      test="string-length(./*[local-name() = 'nameIdentifier']) > 0">
                    <xsl:value-of
                        select="translate(normalize-space(./*[local-name() = 'nameIdentifier']),' .,','___')" />
                  </xsl:when>
                  <xsl:otherwise>
                    <xsl:value-of
                        select="translate(normalize-space(./*[local-name() = 'creatorName']),' .,','___')" />
                  </xsl:otherwise>
                </xsl:choose>
              </xsl:variable>
              <xsl:variable name="personId"
                  select="dnet:oafId('person', $namespaceprefix, normalize-space($personIdTemp))" />

              <xsl:variable name="originalPersonId"
                  select="./*[local-name() = 'nameIdentifier']" />
              <xsl:variable name="position" select="position()" />
              <xsl:if test="string-length($personId) > 0">
                <xsl:variable name="person"
                    select="dnet:oafPerson_FromDatacite($personId, $dataInfo, $collectedfromid, $collectedfromname,$originalPersonId, $dateOfCollection ,normalize-space(./*[local-name() = 'creatorName']))" />

                <xsl:variable name="personresult"
                    select="dnet:oafPersonResult_Authorship_FromDatacite($personId, $resultId, $position, 'isAuthorOf', $dataInfo)" />
                <xsl:variable name="resultperson"
                    select="dnet:oafPersonResult_Authorship_FromDatacite($resultId, $personId, $position, 'hasAuthor', $dataInfo)" />
                <ROW key="{$personId}" columnFamily="person">
                  <QUALIFIER name="body" type="base64">
                    <xsl:value-of select="$person" />
                  </QUALIFIER>
                </ROW>
                <ROW key="{$personId}" columnFamily="personResult_authorship_isAuthorOf">
                  <QUALIFIER name="{$resultId}" type="base64">
                    <xsl:value-of select="$personresult" />
                  </QUALIFIER>
                </ROW>
                <ROW key="{$resultId}" columnFamily="personResult_authorship_hasAuthor">
                  <QUALIFIER name="{$personId}" type="base64">
                    <xsl:value-of select="$resultperson" />
                  </QUALIFIER>
                </ROW>
              </xsl:if>
            </xsl:for-each>

            <!-- Result/project relations, written in both directions -->
            <xsl:for-each select="//*[local-name()='projectid']">

              <xsl:variable name="projectId"
                  select="dnet:oafSplitId('project', normalize-space(.))" />

              <xsl:variable name="resultproject"
                  select="dnet:oafResultProject_Outcome_FromDatacite($resultId, $projectId, 'isProducedBy', $dataInfo)" />
              <xsl:variable name="projectresult"
                  select="dnet:oafResultProject_Outcome_FromDatacite($projectId, $resultId, 'produces', $dataInfo)" />

              <xsl:if test="string-length($projectId) > 0">
                <ROW key="{$resultId}" columnFamily="resultProject_outcome_isProducedBy">
                  <QUALIFIER name="{$projectId}" type="base64">
                    <xsl:value-of select="$resultproject" />
                  </QUALIFIER>
                </ROW>
                <ROW key="{$projectId}" columnFamily="resultProject_outcome_produces">
                  <QUALIFIER name="{$resultId}" type="base64">
                    <xsl:value-of select="$projectresult" />
                  </QUALIFIER>
                </ROW>
              </xsl:if>
            </xsl:for-each>

            <!-- Publication/dataset relations, written in both directions -->
            <xsl:for-each select="//*[local-name()='relatedPublication']">

              <!-- relatedDataset ids must be in the openaire format -->
              <xsl:variable name="publicationId" select="dnet:oafSimpleId('result', normalize-space(./@id))"/>

              <xsl:if test="string-length($publicationId) > 0">

                <xsl:variable name="resultDataset" select="dnet:oafResultResult_PublicationDataset_FromDatacite($resultId, $publicationId, 'isRelatedTo', $dataInfo)"/>
                <xsl:variable name="datasetResult" select="dnet:oafResultResult_PublicationDataset_FromDatacite($publicationId, $resultId, 'isRelatedTo', $dataInfo)"/>

                <ROW key="{$resultId}" columnFamily="resultResult_publicationDataset_isRelatedTo">
                  <QUALIFIER name="{$publicationId}" type="base64"><xsl:value-of select="$resultDataset"/></QUALIFIER>
                </ROW>
                <ROW key="{$publicationId}" columnFamily="resultResult_publicationDataset_isRelatedTo">
                  <QUALIFIER name="{$resultId}" type="base64"><xsl:value-of select="$datasetResult"/></QUALIFIER>
                </ROW>
              </xsl:if>
            </xsl:for-each>
          </ROWS>
        </xsl:if>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>
</xsl:stylesheet>
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/resources/eu/dnetlib/data/transform/simple.xsl | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:dnet="eu.dnetlib.data.transform.xml.AbstractDNetOafXsltFunctions"
    exclude-result-prefixes="xsl dnet">

  <!--
    Minimal stylesheet: passes the children of every element named "metadata" (matched by
    local name) to the dnet:testParse extension function and writes the returned value
    inside a ROWS element.
  -->
  <xsl:output omit-xml-declaration="yes" indent="yes"/>

  <xsl:template match="/*">
    <ROWS>
      <xsl:value-of select="dnet:testParse(//*[local-name()='metadata']/*)"/>
    </ROWS>
  </xsl:template>
</xsl:stylesheet>
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/java/eu/dnetlib/data/transform/ProtoDocumentMapperTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.transform; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.dom4j.DocumentException;
import org.junit.Before;
import org.junit.Test;

import com.google.protobuf.InvalidProtocolBufferException;
import com.googlecode.protobuf.format.JsonFormat;

import eu.dnetlib.data.mapreduce.util.OafTest;
import eu.dnetlib.data.proto.KindProtos.Kind;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.OafProtos.OafEntity;
import eu.dnetlib.functionality.index.solr.feed.InputDocumentFactory;
|
27 |
|
|
28 |
public class ProtoDocumentMapperTest { |
|
29 |
|
|
30 |
private static final Log log = LogFactory.getLog(ProtoDocumentMapperTest.class); // NOPMD by marko on 11/24/08 5:02 PM |
|
31 |
private String fields; |
|
32 |
|
|
33 |
@Before |
|
34 |
public void setUp() throws IOException { |
|
35 |
final StringWriter sw = new StringWriter(); |
|
36 |
IOUtils.copy(getClass().getResourceAsStream("fields.xml"), sw); |
|
37 |
fields = sw.toString(); |
|
38 |
assertNotNull(fields); |
|
39 |
assertFalse(fields.isEmpty()); |
|
40 |
|
|
41 |
log.info(fields); |
|
42 |
} |
|
43 |
|
|
44 |
@Test |
|
45 |
public void testProto2SolrDocument() throws DocumentException, InvalidProtocolBufferException { |
|
46 |
final ProtoDocumentMapper mapper = new ProtoDocumentMapper(fields); |
|
47 |
|
|
48 |
assertNotNull(mapper); |
|
49 |
|
|
50 |
final OafEntity.Builder entity = OafTest.getResultBuilder("01"); |
|
51 |
entity.addChildren(OafTest.getResultBuilder("01_children")); |
|
52 |
|
|
53 |
final Oaf oaf = OafTest.embed(entity.build(), Kind.entity).getOaf(); |
|
54 |
|
|
55 |
assertNotNull(oaf.getEntity().getChildrenList()); |
|
56 |
assertFalse(oaf.getEntity().getChildrenList().isEmpty()); |
|
57 |
|
|
58 |
log.info("byte[] size: " + oaf.toByteArray().length); |
|
59 |
|
|
60 |
log.info("json size: " + JsonFormat.printToString(oaf).length()); |
|
61 |
|
|
62 |
log.info("base64 size: " + Base64.encodeBase64String(oaf.toByteArray()).length()); |
|
63 |
|
|
64 |
final byte[] decodeBase64 = Base64.decodeBase64(Base64.encodeBase64String(oaf.toByteArray())); |
|
65 |
|
|
66 |
log.info("decoded: " + JsonFormat.printToString(Oaf.parseFrom(decodeBase64))); |
|
67 |
|
|
68 |
final SolrInputDocument doc = mapper.map(oaf, InputDocumentFactory.getParsedDateField("2015-02-15"), "asd"); |
|
69 |
|
|
70 |
assertNotNull(doc); |
|
71 |
|
|
72 |
for (final SolrInputField f : doc.values()) { |
|
73 |
log.info(f); |
|
74 |
} |
|
75 |
} |
|
76 |
} |
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf | ||
---|---|---|
1 |
pace.conf { |
|
2 |
conditions { }, |
|
3 |
model { |
|
4 |
title { algo = JaroWinkler, type = String, weight = 0.5, ignoreMissing = false, path = result/metadata/title/value } |
|
5 |
} |
|
6 |
} |
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertTrue; |
|
4 |
|
|
5 |
import java.util.List; |
|
6 |
|
|
7 |
import org.junit.Test; |
|
8 |
|
|
9 |
import com.google.common.collect.Lists; |
|
10 |
|
|
11 |
import eu.dnetlib.pace.AbstractProtoPaceTest; |
|
12 |
import eu.dnetlib.pace.config.Config; |
|
13 |
import eu.dnetlib.pace.model.MapDocument; |
|
14 |
|
|
15 |
public class DetectorTest extends AbstractProtoPaceTest { |
|
16 |
|
|
17 |
@Test |
|
18 |
public void testDistanceResultSimple() { |
|
19 |
final Config config = getResultSimpleConf(); |
|
20 |
|
|
21 |
final MapDocument resA = result(config, "A", "Recent results from CDF"); |
|
22 |
final MapDocument resB = result(config, "B", "Recent results from CDF"); |
|
23 |
|
|
24 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
25 |
System.out.println(String.format(" d ---> %s", d)); |
|
26 |
|
|
27 |
assertTrue(d == 1.0); |
|
28 |
} |
|
29 |
|
|
30 |
@Test |
|
31 |
public void testDistanceResultSimpleMissingDates() { |
|
32 |
final Config config = getResultSimpleConf(); |
|
33 |
|
|
34 |
final MapDocument resA = result(config, "A", "Recent results from BES"); |
|
35 |
final MapDocument resB = result(config, "A", "Recent results from CES"); |
|
36 |
|
|
37 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
38 |
System.out.println(String.format(" d ---> %s", d)); |
|
39 |
|
|
40 |
assertTrue(d > 0.97); |
|
41 |
} |
|
42 |
|
|
43 |
@Test |
|
44 |
public void testDistanceResultInvalidDate() { |
|
45 |
final Config config = getResultConf(); |
|
46 |
|
|
47 |
final MapDocument resA = result(config, "A", "title title title 6BESR", "2013-01-05"); |
|
48 |
final MapDocument resB = result(config, "B", "title title title 6BESR", "qwerty"); |
|
49 |
|
|
50 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
51 |
System.out.println(String.format(" d ---> %s", d)); |
|
52 |
|
|
53 |
assertTrue(d == 1.0); |
|
54 |
} |
|
55 |
|
|
56 |
@Test |
|
57 |
public void testDistanceResultMissingOneDate() { |
|
58 |
final Config config = getResultConf(); |
|
59 |
|
|
60 |
final MapDocument resA = result(config, "A", "title title title 6BESR", null); |
|
61 |
final MapDocument resB = result(config, "B", "title title title 6CLER", "2012-02"); |
|
62 |
|
|
63 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
64 |
System.out.println(String.format(" d ---> %s", d)); |
|
65 |
|
|
66 |
assertTrue((d > 0.9) && (d < 1.0)); |
|
67 |
} |
|
68 |
|
|
69 |
@Test |
|
70 |
public void testDistanceResult() { |
|
71 |
final Config config = getResultConf(); |
|
72 |
|
|
73 |
final MapDocument resA = result(config, "A", "title title title BES", ""); |
|
74 |
final MapDocument resB = result(config, "B", "title title title CLEO"); |
|
75 |
|
|
76 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
77 |
System.out.println(String.format(" d ---> %s", d)); |
|
78 |
|
|
79 |
// assertTrue(d > 0.9 && d < 1.0); |
|
80 |
} |
|
81 |
|
|
82 |
@Test |
|
83 |
public void testDistanceResultMissingTwoDate() { |
|
84 |
final Config config = getResultConf(); |
|
85 |
|
|
86 |
final MapDocument resA = result(config, "A", "title title title 6BESR"); |
|
87 |
final MapDocument resB = result(config, "B", "title title title 6CLER"); |
|
88 |
|
|
89 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
90 |
|
|
91 |
System.out.println(String.format(" d ---> %s", d)); |
|
92 |
|
|
93 |
assertTrue((d > 0.9) && (d < 1.0)); |
|
94 |
} |
|
95 |
|
|
96 |
@Test |
|
97 |
public void testDistanceOrganizationIgnoreMissing() { |
|
98 |
|
|
99 |
final Config config = getOrganizationSimpleConf(); |
|
100 |
|
|
101 |
final MapDocument orgA = organization(config, "A", "CONSIGLIO NAZIONALE DELLE RICERCHE"); |
|
102 |
final MapDocument orgB = organization(config, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR"); |
|
103 |
|
|
104 |
final double d = new PaceDocumentDistance().between(orgA, orgB, config); |
|
105 |
System.out.println(String.format(" d ---> %s", d)); |
|
106 |
|
|
107 |
assertTrue(d == 1.0); |
|
108 |
} |
|
109 |
|
|
110 |
@Test |
|
111 |
public void testDistanceResultCase1() { |
|
112 |
|
|
113 |
final Config config = getResultConf(); |
|
114 |
|
|
115 |
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003"); |
|
116 |
final MapDocument resB = result(config, "B", "Search for the Standard Model Higgs Boson", "2003"); |
|
117 |
|
|
118 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
119 |
System.out.println(String.format(" d ---> %s", d)); |
|
120 |
|
|
121 |
assertTrue((d > 0.9) && (d < 1.0)); |
|
122 |
} |
|
123 |
|
|
124 |
@Test |
|
125 |
public void testDistanceResultCaseDoiMatch1() { |
|
126 |
final Config config = getResultConf(); |
|
127 |
|
|
128 |
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003", "http://dx.doi.org/10.1594/PANGAEA.726855"); |
|
129 |
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", "10.1594/PANGAEA.726855"); |
|
130 |
|
|
131 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
132 |
System.out.println(String.format(" d ---> %s", d)); |
|
133 |
|
|
134 |
assertTrue("exact DOIs will produce an exact match", d == 1.0); |
|
135 |
} |
|
136 |
|
|
137 |
@Test |
|
138 |
public void testDistanceResultCaseDoiMatch2() { |
|
139 |
final Config config = getResultConf(); |
|
140 |
|
|
141 |
final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "http://dx.doi.org/10.1594/PANGAEA.726855"); |
|
142 |
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2005", "doi:10.1594/PANGAEA.726855"); |
|
143 |
|
|
144 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
145 |
System.out.println(String.format(" d ---> %s", d)); |
|
146 |
|
|
147 |
assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years", d == 1.0); |
|
148 |
} |
|
149 |
|
|
150 |
@Test |
|
151 |
public void testDistanceResultCaseDoiMatch3() { |
|
152 |
final Config config = getResultConf(); |
|
153 |
|
|
154 |
final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024"); |
|
155 |
final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003"); |
|
156 |
|
|
157 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
158 |
System.out.println(String.format(" d ---> %s", d)); |
|
159 |
|
|
160 |
assertTrue("a missing DOI will casue the comparsion to continue with the following conditions", d == 1.0); |
|
161 |
} |
|
162 |
|
|
163 |
@Test |
|
164 |
public void testDistanceResultCaseDoiMatch4() { |
|
165 |
final Config config = getResultConf(); |
|
166 |
|
|
167 |
final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024"); |
|
168 |
final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2005"); |
|
169 |
|
|
170 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
171 |
System.out.println(String.format(" d ---> %s", d)); |
|
172 |
|
|
173 |
assertTrue("a missing DOI, comparsion continues with the following conditions, different publication years will drop the score to 0", d == 0.0); |
|
174 |
} |
|
175 |
|
|
176 |
@Test |
|
177 |
public void testDistanceResultCaseDoiMatch5() { |
|
178 |
|
|
179 |
final Config config = getResultConf(); |
|
180 |
|
|
181 |
final MapDocument resA = result(config, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020"); |
|
182 |
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003"); |
|
183 |
|
|
184 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
185 |
System.out.println(String.format(" d ---> %s", d)); |
|
186 |
|
|
187 |
assertTrue("a missing DOI, comparsion continues with the following conditions", (d > 0.9) && (d < 1.0)); |
|
188 |
} |
|
189 |
|
|
190 |
@Test |
|
191 |
public void testDistanceResultCaseDoiMatch6() { |
|
192 |
final Config config = getResultConf(); |
|
193 |
|
|
194 |
final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024"); |
|
195 |
final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003", "anotherDifferentDOI"); |
|
196 |
|
|
197 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
198 |
System.out.println(String.format(" d ---> %s", d)); |
|
199 |
|
|
200 |
assertTrue("different DOIs will drop the score to 0, regardless of the other fields", d == 0.0); |
|
201 |
} |
|
202 |
|
|
203 |
// http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855 |
|
204 |
|
|
205 |
@Test |
|
206 |
public void testDistanceResultCaseAuthor1() { |
|
207 |
|
|
208 |
final Config config = getResultAuthorsConf(); |
|
209 |
|
|
210 |
final List<String> authorsA = Lists.newArrayList("a", "b", "c", "d"); |
|
211 |
final List<String> authorsB = Lists.newArrayList("a", "b", "c"); |
|
212 |
|
|
213 |
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", null, authorsA); |
|
214 |
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", null, authorsB); |
|
215 |
|
|
216 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
217 |
System.out.println(String.format(" d ---> %s", d)); |
|
218 |
|
|
219 |
assertTrue(d == 0.0); |
|
220 |
} |
|
221 |
|
|
222 |
@Test |
|
223 |
public void testDistanceResultCaseAuthor2() { |
|
224 |
|
|
225 |
final Config config = getResultAuthorsConf(); |
|
226 |
|
|
227 |
final List<String> authorsA = Lists.newArrayList("a", "b", "c"); |
|
228 |
final List<String> authorsB = Lists.newArrayList("a", "b", "c"); |
|
229 |
|
|
230 |
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", null, authorsA); |
|
231 |
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", null, authorsB); |
|
232 |
|
|
233 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
234 |
System.out.println(String.format(" d ---> %s", d)); |
|
235 |
|
|
236 |
assertTrue(d == 1.0); |
|
237 |
} |
|
238 |
|
|
239 |
@Test |
|
240 |
public void testDistanceResultCaseAuthor3() { |
|
241 |
|
|
242 |
final Config config = getResultAuthorsConf(); |
|
243 |
|
|
244 |
final List<String> authorsA = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M."); |
|
245 |
final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele"); |
|
246 |
|
|
247 |
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", null, authorsA); |
|
248 |
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", null, authorsB); |
|
249 |
|
|
250 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
251 |
System.out.println(String.format(" d ---> %s", d)); |
|
252 |
|
|
253 |
assertTrue((d > 0.9) && (d < 1.0)); |
|
254 |
} |
|
255 |
|
|
256 |
@Test |
|
257 |
public void testDistanceResultCaseAuthor4() { |
|
258 |
|
|
259 |
final Config config = getResultAuthorsConf(); |
|
260 |
|
|
261 |
final List<String> authorsA = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a"); |
|
262 |
final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele"); |
|
263 |
|
|
264 |
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", null, authorsA); |
|
265 |
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", null, authorsB); |
|
266 |
|
|
267 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
268 |
System.out.println(String.format(" d ---> %s", d)); |
|
269 |
|
|
270 |
// assertTrue(d == 0.0); |
|
271 |
} |
|
272 |
|
|
273 |
@Test |
|
274 |
public void testDistanceResultFullConf() { |
|
275 |
|
|
276 |
final Config config = getResultFullConf(); |
|
277 |
|
|
278 |
final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva"); |
|
279 |
final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie"); |
|
280 |
|
|
281 |
final MapDocument resA = |
|
282 |
result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010", |
|
283 |
"10.1186/1752-1947-4-299", authorsA); |
|
284 |
final MapDocument resB = |
|
285 |
result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010", null, |
|
286 |
authorsB); |
|
287 |
|
|
288 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
289 |
System.out.println(String.format(" d ---> %s", d)); |
|
290 |
|
|
291 |
// assertTrue(d == 0.0); |
|
292 |
} |
|
293 |
|
|
294 |
@Test |
|
295 |
public void testDistanceStdl1() { |
|
296 |
|
|
297 |
final Config config = getResultStdlConf(); |
|
298 |
|
|
299 |
final List<String> authorsA = Lists.newArrayList(); |
|
300 |
final List<String> authorsB = |
|
301 |
Lists.newArrayList("Giovannelli A.", "Traversi M. L.", "Sebastiani L.", "Tognetti R.", "d?Andria R.", "Morelli G.", "Fragnito F.", "Lavini A."); |
|
302 |
|
|
303 |
final MapDocument resA = |
|
304 |
result(config, |
|
305 |
"A", |
|
306 |
"Evaluating water use strategies in olive trees grown under different irrigation regimes through integrated approaches between sap flow and high resolution stem growth analysis", |
|
307 |
"2008-01-01", |
|
308 |
null, authorsA); |
|
309 |
final MapDocument resB = |
|
310 |
result(config, |
|
311 |
"B", |
|
312 |
"Evaluating water use strategies in olive trees grown under different irrigation regimes through integrated approaches between sap flow and high resolution stem growth analysis", |
|
313 |
"2008-01-01", |
|
314 |
null, |
|
315 |
authorsB); |
|
316 |
|
|
317 |
final double d = new PaceDocumentDistance().between(resA, resB, config); |
|
318 |
System.out.println(String.format(" d ---> %s", d)); |
|
319 |
|
|
320 |
// assertTrue(d == 0.0); |
|
321 |
} |
|
322 |
|
|
323 |
} |
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf | ||
---|---|---|
1 |
pace.conf { |
|
2 |
clustering { |
|
3 |
acronyms { fields = [title, desc], params = { max = 1, minLen = 2, maxLen = 4} }, |
|
4 |
ngrampairs { fields = [title], params = { max = 1, ngramLen = 3} }, |
|
5 |
suffixprefix { fields = [title], params = { max = 1, len = 3 } } |
|
6 |
}, |
|
7 |
conditions { |
|
8 |
titleVersionMatch { fields = [title] }, |
|
9 |
sizeMatch { fields = [authors] } |
|
10 |
}, |
|
11 |
model { |
|
12 |
title { algo = JaroWinkler, type = String, weight = 0.5, ignoreMissing = false, path = result/metadata/title/value }, |
|
13 |
authors { algo = SortedLevel2JaroWinkler, type = List, weight = 0.5, ignoreMissing = true, path = result/author/metadata/fullname/value } |
|
14 |
}, |
|
15 |
blacklists = { |
|
16 |
title = [ |
|
17 |
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$", |
|
18 |
"^(Kiri Karl Morgensternile).*$", |
|
19 |
"^(\\[Eksliibris Aleksandr).*\\]$", |
|
20 |
"^(\\[Eksliibris Aleksandr).*$", |
|
21 |
"^(Eksliibris Aleksandr).*$", |
|
22 |
"^(Kiri A\\. de Vignolles).*$", |
|
23 |
"^(2 kirja Karl Morgensternile).*$", |
|
24 |
"^(Pirita kloostri idaosa arheoloogilised).*$", |
|
25 |
"^(Kiri tundmatule).*$", |
|
26 |
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", |
|
27 |
"^(Eksliibris Nikolai Birukovile).*$", |
|
28 |
"^(Eksliibris Nikolai Issakovile).*$", |
|
29 |
"^(WHP Cruise Summary Information of section).*$", |
|
30 |
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", |
|
31 |
"^(Measurement of the spin\\-dependent structure function).*" |
|
32 |
] } |
|
33 |
} |
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import org.junit.Test; |
|
4 |
|
|
5 |
import eu.dnetlib.pace.AbstractProtoPaceTest; |
|
6 |
import eu.dnetlib.pace.config.Config; |
|
7 |
|
|
8 |
public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest { |
|
9 |
|
|
10 |
@Test |
|
11 |
public void test1() { |
|
12 |
|
|
13 |
String id = "12345"; |
|
14 |
|
|
15 |
Config config = getResultConf(); |
|
16 |
|
|
17 |
MapDocument document = ProtoDocumentBuilder.newInstance(id, getResult(id), config.fields()); |
|
18 |
|
|
19 |
System.out.println(document); |
|
20 |
|
|
21 |
String stringDoc = MapDocumentSerializer.toString(document); |
|
22 |
|
|
23 |
System.out.println(stringDoc); |
|
24 |
|
|
25 |
MapDocument decoded = MapDocumentSerializer.decode(stringDoc.getBytes()); |
|
26 |
|
|
27 |
System.out.println(decoded); |
|
28 |
} |
|
29 |
|
|
30 |
} |
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/java/eu/dnetlib/data/transform/OafEntityMerger.java | ||
---|---|---|
1 |
package eu.dnetlib.data.transform; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
import java.util.Map; |
|
5 |
import java.util.Set; |
|
6 |
|
|
7 |
import com.google.common.base.Predicate; |
|
8 |
import com.google.common.collect.Iterables; |
|
9 |
import com.google.common.collect.Lists; |
|
10 |
import com.google.common.collect.Maps; |
|
11 |
import com.google.common.collect.Sets; |
|
12 |
import com.google.protobuf.Descriptors.FieldDescriptor; |
|
13 |
import com.google.protobuf.Message.Builder; |
|
14 |
|
|
15 |
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue; |
|
16 |
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; |
|
17 |
import eu.dnetlib.data.proto.FieldTypeProtos.StringField; |
|
18 |
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty; |
|
19 |
import eu.dnetlib.data.proto.KindProtos.Kind; |
|
20 |
import eu.dnetlib.data.proto.OafProtos.Oaf; |
|
21 |
import eu.dnetlib.data.proto.OafProtos.OafEntity; |
|
22 |
import eu.dnetlib.data.proto.PersonProtos.Person; |
|
23 |
import eu.dnetlib.data.proto.ResultProtos.Result; |
|
24 |
import eu.dnetlib.data.proto.SpecialTrustProtos.SpecialTrust; |
|
25 |
import eu.dnetlib.pace.util.DedupConfig; |
|
26 |
|
|
27 |
public class OafEntityMerger { |
|
28 |
|
|
29 |
private static final String DEDUP_CLASSID = "sysimport:dedup"; |
|
30 |
|
|
31 |
private static final String DNET_PROVENANCE_SCHEME = "dnet:provenanceActions"; |
|
32 |
|
|
33 |
private final Predicate<StringField> skipEmptyStringField = new Predicate<StringField>() { |
|
34 |
|
|
35 |
@Override |
|
36 |
public boolean apply(final StringField s) { |
|
37 |
return (s != null) && (s.getValue() != null) && !s.getValue().isEmpty(); |
|
38 |
} |
|
39 |
}; |
|
40 |
|
|
41 |
private final Predicate<String> skipEmptyString = new Predicate<String>() { |
|
42 |
|
|
43 |
@Override |
|
44 |
public boolean apply(final String s) { |
|
45 |
return (s != null) && !s.isEmpty(); |
|
46 |
} |
|
47 |
}; |
|
48 |
|
|
49 |
public static Oaf.Builder merge(final String id, final Iterable<Oaf> entities) { |
|
50 |
return merge(null, id, entities); |
|
51 |
} |
|
52 |
|
|
53 |
public static Oaf.Builder merge(final DedupConfig dedupConf, final String id, final Iterable<Oaf> entities) { |
|
54 |
return new OafEntityMerger().mergeEntities(dedupConf, id, entities); |
|
55 |
} |
|
56 |
|
|
57 |
public static Oaf.Builder merge(final Oaf.Builder builder) { |
|
58 |
return new OafEntityMerger().doMergeEntities(builder); |
|
59 |
} |
|
60 |
|
|
61 |
public Oaf.Builder mergeEntities(final DedupConfig dedupConf, final String id, final Iterable<Oaf> entities) { |
|
62 |
|
|
63 |
Oaf.Builder builder = Oaf.newBuilder(); |
|
64 |
String trust = "0.0"; |
|
65 |
for (final Oaf oaf : TrustOrdering.sort(entities)) { |
|
66 |
// doublecheck we're dealing only with main entities |
|
67 |
if (!oaf.getKind().equals(Kind.entity)) throw new IllegalArgumentException("expected OafEntity!"); |
|
68 |
|
|
69 |
final String currentTrust = oaf.getDataInfo().getTrust(); |
|
70 |
if (!currentTrust.equals(SpecialTrust.NEUTRAL.toString())) { |
|
71 |
trust = currentTrust; |
|
72 |
} |
|
73 |
builder.mergeFrom(oaf); |
|
74 |
} |
|
75 |
|
|
76 |
builder = doMergeEntities(builder); |
|
77 |
builder.getEntityBuilder().setId(id); |
|
78 |
builder.getDataInfoBuilder() |
|
79 |
.setInferred(true) |
|
80 |
.setDeletedbyinference(false) |
|
81 |
.setTrust(trust) |
|
82 |
.setProvenanceaction(getProvenanceAction()); |
|
83 |
|
|
84 |
if ((dedupConf != null) && dedupConf.isIncludeChildren()) { |
|
85 |
for (final Oaf oaf : Iterables.limit(entities, dedupConf.getMaxChildren())) { |
|
86 |
builder.getEntityBuilder().addChildren(oaf.getEntity()); |
|
87 |
} |
|
88 |
} |
|
89 |
|
|
90 |
return builder; |
|
91 |
} |
|
92 |
|
|
93 |
private Qualifier.Builder getProvenanceAction() { |
|
94 |
return Qualifier.newBuilder().setClassid(DEDUP_CLASSID).setClassname(DEDUP_CLASSID).setSchemeid(DNET_PROVENANCE_SCHEME) |
|
95 |
.setSchemename(DNET_PROVENANCE_SCHEME); |
|
96 |
} |
|
97 |
|
|
98 |
public Oaf.Builder doMergeEntities(final Oaf.Builder builder) { |
|
99 |
|
|
100 |
switch (builder.getEntity().getType()) { |
|
101 |
case datasource: |
|
102 |
break; |
|
103 |
case organization: |
|
104 |
break; |
|
105 |
case person: |
|
106 |
final Person.Metadata.Builder person = builder.getEntityBuilder().getPersonBuilder().getMetadataBuilder(); |
|
107 |
for (final String field : Lists.newArrayList("secondnames")) { |
|
108 |
setSingleString(person, field); |
|
109 |
} |
|
110 |
break; |
|
111 |
case project: |
|
112 |
break; |
|
113 |
case result: |
|
114 |
final Result.Metadata.Builder result = builder.getEntityBuilder().getResultBuilder().getMetadataBuilder(); |
|
115 |
setTitle(result); |
|
116 |
|
|
117 |
// for (String field : Lists.newArrayList("subject", "relevantdate")) { |
|
118 |
for (final String field : OafUtils.getFieldNames(Result.Metadata.getDescriptor(), Result.Metadata.SUBJECT_FIELD_NUMBER, |
|
119 |
Result.Metadata.RELEVANTDATE_FIELD_NUMBER)) { |
|
120 |
setStructuredProperty(result, field); |
|
121 |
} |
|
122 |
for (final String field : OafUtils.getFieldNames(Result.Metadata.getDescriptor(), Result.Metadata.DESCRIPTION_FIELD_NUMBER)) { |
|
123 |
setLongestStringField(result, field); |
|
124 |
} |
|
125 |
for (final String field : OafUtils.getFieldNames(Result.Metadata.getDescriptor(), Result.Metadata.SOURCE_FIELD_NUMBER)) { |
|
126 |
setUniqueStringField(result, field); |
|
127 |
} |
|
128 |
for (final String field : OafUtils.getFieldNames(OafEntity.getDescriptor(), OafEntity.COLLECTEDFROM_FIELD_NUMBER)) { |
|
129 |
setKeyValues(builder.getEntityBuilder(), field); |
|
130 |
} |
|
131 |
for (final String field : OafUtils.getFieldNames(OafEntity.getDescriptor(), OafEntity.PID_FIELD_NUMBER)) { |
|
132 |
setStructuredProperty(builder.getEntityBuilder(), field); |
|
133 |
} |
|
134 |
for (final String field : OafUtils.getFieldNames(OafEntity.getDescriptor(), OafEntity.ORIGINALID_FIELD_NUMBER)) { |
|
135 |
setUniqueString(builder.getEntityBuilder(), field); |
|
136 |
} |
|
137 |
|
|
138 |
// remove the inner authors, rely on the children |
|
139 |
builder.getEntityBuilder().getResultBuilder().clearAuthor(); |
|
140 |
break; |
|
141 |
default: |
|
142 |
break; |
|
143 |
} |
|
144 |
return builder; |
|
145 |
} |
|
146 |
|
|
147 |
/** |
|
148 |
* Helper method, avoid duplicated StructuredProperties in the given builder for the given fieldName |
|
149 |
* |
|
150 |
* @param builder |
|
151 |
* @param fieldName |
|
152 |
*/ |
|
153 |
@SuppressWarnings("unchecked") |
|
154 |
private void setStructuredProperty(final Builder builder, final String fieldName) { |
|
155 |
final Map<String, StructuredProperty> map = Maps.newHashMap(); |
|
156 |
final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName); |
|
157 |
final List<StructuredProperty> sps = (List<StructuredProperty>) builder.getField(fd); |
|
158 |
|
|
159 |
if ((sps != null) && !sps.isEmpty()) { |
|
160 |
for (final StructuredProperty sp : sps) { |
|
161 |
map.put(sp.getValue(), sp); |
|
162 |
} |
|
163 |
|
|
164 |
if (!map.isEmpty()) { |
|
165 |
builder.clearField(fd).setField(fd, Lists.newArrayList(map.values())); |
|
166 |
} |
|
167 |
} |
|
168 |
} |
|
169 |
|
|
170 |
/** |
|
171 |
* Helper method, avoid duplicated KeyValues in the given builder for the given fieldName |
|
172 |
* |
|
173 |
* @param builder |
|
174 |
* @param fieldName |
|
175 |
*/ |
|
176 |
@SuppressWarnings("unchecked") |
|
177 |
private void setKeyValues(final Builder builder, final String fieldName) { |
|
178 |
final Map<String, KeyValue> map = Maps.newHashMap(); |
|
179 |
final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName); |
|
180 |
final List<KeyValue> kvs = (List<KeyValue>) builder.getField(fd); |
|
181 |
|
|
182 |
if ((kvs != null) && !kvs.isEmpty()) { |
|
183 |
for (final KeyValue sp : kvs) { |
|
184 |
map.put(sp.getKey(), sp); |
|
185 |
} |
|
186 |
|
|
187 |
if (!map.isEmpty()) { |
|
188 |
builder.clearField(fd).setField(fd, Lists.newArrayList(map.values())); |
|
189 |
} |
|
190 |
} |
|
191 |
} |
|
192 |
|
|
193 |
@SuppressWarnings("unchecked") |
|
194 |
private void setSingleString(final Builder builder, final String fieldName) { |
|
195 |
|
|
196 |
final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName); |
|
197 |
final List<StringField> field = (List<StringField>) builder.getField(fd); |
|
198 |
if ((field != null) && !field.isEmpty()) { |
|
199 |
final StringField s = (StringField) Iterables.getLast(Iterables.filter(field, skipEmptyStringField), ""); |
|
200 |
|
|
201 |
if ((s != null) && (s.getValue() != null) && !s.getValue().isEmpty()) { |
|
202 |
builder.clearField(fd).setField(fd, Lists.newArrayList(s)); |
|
203 |
} |
|
204 |
} |
|
205 |
} |
|
206 |
|
|
207 |
@SuppressWarnings("unchecked") |
|
208 |
private void setLongestStringField(final Builder builder, final String fieldName) { |
|
209 |
|
|
210 |
final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName); |
|
211 |
final List<StringField> field = (List<StringField>) builder.getField(fd); |
|
212 |
|
|
213 |
if ((field != null) && !field.isEmpty()) { |
|
214 |
final StringField.Builder max = StringField.newBuilder().setValue(""); |
|
215 |
int maxLength = 0; |
|
216 |
for (final StringField sf : field) { |
|
217 |
if (sf.getValue().length() > maxLength) { |
|
218 |
maxLength = sf.getValue().length(); |
|
219 |
max.clear(); |
|
220 |
max.mergeFrom(sf); |
|
221 |
} |
|
222 |
} |
|
223 |
|
|
224 |
builder.clearField(fd).setField(fd, Lists.newArrayList(max.build())); |
|
225 |
} |
|
226 |
} |
|
227 |
|
|
228 |
@SuppressWarnings("unchecked") |
|
229 |
private void setUniqueStringField(final Builder builder, final String fieldName) { |
|
230 |
|
|
231 |
final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName); |
|
232 |
final List<StringField> field = (List<StringField>) builder.getField(fd); |
|
233 |
final Map<String, StringField> map = Maps.newHashMap(); |
|
234 |
if ((field != null) && !field.isEmpty()) { |
|
235 |
for (final StringField s : Iterables.filter(field, skipEmptyStringField)) { |
|
236 |
map.put(s.getValue(), s); |
|
237 |
} |
|
238 |
|
|
239 |
builder.clearField(fd).setField(fd, Lists.newArrayList(map.values())); |
|
240 |
} |
|
241 |
} |
|
242 |
|
|
243 |
@SuppressWarnings("unchecked") |
|
244 |
private void setUniqueString(final Builder builder, final String fieldName) { |
|
245 |
|
|
246 |
final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName); |
|
247 |
final List<String> field = (List<String>) builder.getField(fd); |
|
248 |
final Set<String> set = Sets.newHashSet(); |
|
249 |
if ((field != null) && !field.isEmpty()) { |
|
250 |
for (final String s : Iterables.filter(field, skipEmptyString)) { |
|
251 |
set.add(s); |
|
252 |
} |
|
253 |
|
|
254 |
builder.clearField(fd).setField(fd, Lists.newArrayList(set)); |
|
255 |
} |
|
256 |
} |
|
257 |
|
|
258 |
private void setTitle(final Result.Metadata.Builder metadata) { |
|
259 |
final Iterable<StructuredProperty> filtered = Iterables.filter(metadata.getTitleList(), OafUtils.mainTitleFilter()); |
|
260 |
|
|
261 |
if (!Iterables.isEmpty(filtered)) { |
|
262 |
metadata.clearTitle().addTitle(Iterables.getLast(filtered)); |
|
263 |
} |
|
264 |
} |
|
265 |
|
|
266 |
} |
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/java/eu/dnetlib/data/transform/ProtoDocumentMapper.java | ||
---|---|---|
1 |
package eu.dnetlib.data.transform; |
|
2 |
|
|
3 |
import java.io.StringReader; |
|
4 |
|
|
5 |
import org.apache.commons.codec.binary.Base64; |
|
6 |
import org.apache.commons.lang.StringUtils; |
|
7 |
import org.apache.solr.common.SolrInputDocument; |
|
8 |
import org.dom4j.Document; |
|
9 |
import org.dom4j.DocumentException; |
|
10 |
import org.dom4j.Element; |
|
11 |
import org.dom4j.io.SAXReader; |
|
12 |
|
|
13 |
import com.google.common.base.Splitter; |
|
14 |
import com.google.common.collect.Lists; |
|
15 |
import com.google.protobuf.GeneratedMessage; |
|
16 |
|
|
17 |
/** |
|
18 |
* The Class ProtoDocumentMapper. |
|
19 |
*/ |
|
20 |
public class ProtoDocumentMapper extends AbstractProtoMapper { |
|
21 |
|
|
22 |
/** The fields. */ |
|
23 |
private Document fields; |
|
24 |
|
|
25 |
/** |
|
26 |
* Instantiates a new proto document mapper. |
|
27 |
* |
|
28 |
* @param fields |
|
29 |
* the fields |
|
30 |
* @throws DocumentException |
|
31 |
* the document exception |
|
32 |
*/ |
|
33 |
public ProtoDocumentMapper(final String fields) throws DocumentException { |
|
34 |
this.fields = parse(fields); |
|
35 |
|
|
36 |
if (StringUtils.isBlank(this.fields.valueOf("//FIELD[@name = 'objIdentifier']/@name"))) |
|
37 |
throw new IllegalArgumentException("field objIdentifier is mandatory"); |
|
38 |
} |
|
39 |
|
|
40 |
/** |
|
41 |
* Map. |
|
42 |
* |
|
43 |
* @param proto |
|
44 |
* the proto |
|
45 |
* @param version |
|
46 |
* the version |
|
47 |
* @param dsId |
|
48 |
* the ds id |
|
49 |
* @return the solr input document |
|
50 |
* @throws DocumentException |
|
51 |
* the document exception |
|
52 |
*/ |
|
53 |
public SolrInputDocument map(final GeneratedMessage proto, final String version, final String dsId) throws DocumentException { |
|
54 |
|
|
55 |
final SolrInputDocument doc = new SolrInputDocument(); |
|
56 |
|
|
57 |
for (final Object o : fields.selectNodes("//FIELD")) { |
|
58 |
final Element e = (Element) o; |
|
59 |
|
|
60 |
final String name = e.attribute("name").getValue().toLowerCase().trim(); |
|
61 |
final String path = e.attribute("path").getValue(); |
|
62 |
|
|
63 |
doc.setField(name, processMultiPath(proto, Lists.newLinkedList(Splitter.on("|").trimResults().split(path)))); |
|
64 |
} |
|
65 |
|
|
66 |
doc.setField("__dsid", dsId); |
|
67 |
doc.setField("__dsversion", version); |
|
68 |
doc.setField("objidentifier", patchId((String) doc.getFieldValue("objidentifier"))); |
|
69 |
doc.setField("__indexrecordidentifier", doc.getFieldValue("objidentifier")); |
|
70 |
doc.setField("__result", Base64.encodeBase64String(proto.toByteArray())); |
|
71 |
|
|
72 |
return doc; |
|
73 |
} |
|
74 |
|
|
75 |
/** |
|
76 |
* Patch the objidentifier: when it comes from HBase, i.e. contains the separator '|' returns the string that follows. |
|
77 |
* |
|
78 |
* @param objidentifier |
|
79 |
* the objidentifier |
|
80 |
* @return the string |
|
81 |
*/ |
|
82 |
private String patchId(final String objidentifier) { |
|
83 |
return objidentifier.contains("|") ? StringUtils.substringAfter(objidentifier, "|") : objidentifier; |
|
84 |
} |
|
85 |
|
|
86 |
/** |
|
87 |
* Parses the. |
|
88 |
* |
|
89 |
* @param s |
|
90 |
* the s |
|
91 |
* @return the document |
|
92 |
* @throws DocumentException |
|
93 |
* the document exception |
|
94 |
*/ |
|
95 |
private Document parse(final String s) throws DocumentException { |
|
96 |
return new SAXReader().read(new StringReader(s)); |
|
97 |
} |
|
98 |
|
|
99 |
} |
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/pom.xml | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> |
|
3 |
<parent> |
|
4 |
<groupId>eu.dnetlib</groupId> |
|
5 |
<artifactId>dnet-hadoop-parent</artifactId> |
|
6 |
<version>1.0.0</version> |
|
7 |
<relativePath /> |
|
8 |
</parent> |
|
9 |
<modelVersion>4.0.0</modelVersion> |
|
10 |
<groupId>eu.dnetlib</groupId> |
|
11 |
<artifactId>dnet-openaireplus-mapping-utils</artifactId> |
|
12 |
<packaging>jar</packaging> |
|
13 |
<version>3.0.7-SNAPSHOT</version> |
|
14 |
<scm> |
|
15 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-openaireplus-mapping-utils/trunk</developerConnection> |
|
16 |
</scm> |
|
17 |
<dependencies> |
|
18 |
<dependency> |
|
19 |
<groupId>com.google.guava</groupId> |
|
20 |
<artifactId>guava</artifactId> |
|
21 |
<version>${google.guava.version}</version> |
|
22 |
</dependency> |
|
23 |
<dependency> |
|
24 |
<groupId>junit</groupId> |
|
25 |
<artifactId>junit</artifactId> |
|
26 |
<version>${junit.version}</version> |
|
27 |
<scope>test</scope> |
|
28 |
</dependency> |
|
29 |
<dependency> |
|
30 |
<groupId>commons-codec</groupId> |
|
31 |
<artifactId>commons-codec</artifactId> |
|
32 |
<version>${commons.codec.version}</version> |
|
33 |
</dependency> |
|
34 |
<dependency> |
|
35 |
<groupId>dom4j</groupId> |
|
36 |
<artifactId>dom4j</artifactId> |
|
37 |
<version>${dom4j.version}</version> |
|
38 |
</dependency> |
|
39 |
<dependency> |
|
40 |
<groupId>eu.dnetlib</groupId> |
|
41 |
<artifactId>dnet-openaire-data-protos</artifactId> |
|
42 |
<version>[3.0.0,4.0.0)</version> |
|
43 |
</dependency> |
|
44 |
<dependency> |
|
45 |
<groupId>eu.dnetlib</groupId> |
|
46 |
<artifactId>dnet-pace-core</artifactId> |
|
47 |
<version>[1.3.0,2.0.0)</version> |
|
48 |
</dependency> |
|
49 |
<dependency> |
|
50 |
<groupId>eu.dnetlib</groupId> |
|
51 |
<artifactId>cnr-misc-utils</artifactId> |
|
52 |
<version>[1.0.0,2.0.0)</version> |
|
53 |
</dependency> |
|
54 |
<dependency> |
|
55 |
<groupId>eu.dnetlib</groupId> |
|
56 |
<artifactId>dnet-hadoop-commons</artifactId> |
|
57 |
<version>[1.0.0,2.0.0)</version> |
|
58 |
</dependency> |
|
59 |
<dependency> |
|
60 |
<groupId>eu.dnetlib</groupId> |
|
61 |
<artifactId>dnet-index-solr-common</artifactId> |
|
62 |
<version>[1.0.0,2.0.0)</version> |
|
63 |
</dependency> |
|
64 |
<dependency> |
|
65 |
<groupId>com.googlecode.protobuf-java-format</groupId> |
|
66 |
<artifactId>protobuf-java-format</artifactId> |
|
67 |
<version>1.2</version> |
|
68 |
<scope>test</scope> |
|
69 |
</dependency> |
|
70 |
</dependencies> |
|
71 |
</project> |
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/test/java/eu/dnetlib/data/mapreduce/util/OafRelDecoderTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.util; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertEquals; |
|
4 |
import static org.junit.Assert.assertNotNull; |
|
5 |
|
|
6 |
import org.junit.Before; |
|
7 |
import org.junit.Test; |
|
8 |
|
|
9 |
import com.google.protobuf.Descriptors.FieldDescriptor; |
|
10 |
|
|
11 |
import eu.dnetlib.data.proto.OafProtos.OafRel; |
|
12 |
import eu.dnetlib.data.proto.PersonResultProtos.PersonResult.Authorship; |
|
13 |
import eu.dnetlib.data.proto.RelTypeProtos.RelType; |
|
14 |
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType; |
|
15 |
|
|
16 |
public class OafRelDecoderTest { |
|
17 |
|
|
18 |
private OafRel oafRel; |
|
19 |
|
|
20 |
@Before |
|
21 |
public void setUp() { |
|
22 |
oafRel = OafTest.getPersonResult("ID_1", "ID_2", "1", "isAuthor"); |
|
23 |
} |
|
24 |
|
|
25 |
@Test |
|
26 |
public void testSetClass() { |
|
27 |
|
|
28 |
OafRelDecoder d1 = OafRelDecoder.decode(oafRel); |
|
29 |
|
|
30 |
assertNotNull(d1); |
|
31 |
assertEquals("isAuthor", d1.getRelClass()); |
|
32 |
|
|
33 |
OafRelDecoder d2 = OafRelDecoder.decode(d1.setClassId("hasAuthor").build()); |
|
34 |
|
|
35 |
assertEquals("hasAuthor", d2.getRelClass()); |
|
36 |
assertEquals("hasAuthor", d2.getRelMetadata().getSemantics().getClassid()); |
|
37 |
assertEquals("hasAuthor", d2.getRelMetadata().getSemantics().getClassname()); |
|
38 |
|
|
39 |
FieldDescriptor fd = Authorship.getDescriptor().findFieldByName("ranking"); |
|
40 |
assertEquals(d1.getSubRel().getField(fd), d2.getSubRel().getField(fd)); |
|
41 |
} |
|
42 |
|
|
43 |
@Test |
|
44 |
public void testGetCF() { |
|
45 |
assertEquals("personResult_authorship_isAuthorOf", OafRelDecoder.getCFQ(RelType.personResult, SubRelType.authorship, Authorship.RelName.isAuthorOf)); |
|
46 |
assertEquals("personResult_authorship_isAuthorOf", OafRelDecoder.getCFQ(RelType.personResult, SubRelType.authorship, "isAuthorOf")); |
|
47 |
} |
|
48 |
|
|
49 |
} |
modules/dnet-openaireplus-mapping-utils/branches/CDH-5.3.X/src/main/java/eu/dnetlib/data/transform/AbstractProtoMapper.java | ||
---|---|---|
1 |
package eu.dnetlib.data.transform; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
|
|
5 |
import org.apache.commons.lang.StringUtils; |
|
6 |
|
|
7 |
import com.google.common.base.Splitter; |
|
8 |
import com.google.common.collect.Lists; |
|
9 |
import com.google.protobuf.Descriptors.EnumValueDescriptor; |
|
10 |
import com.google.protobuf.Descriptors.FieldDescriptor; |
|
11 |
import com.google.protobuf.GeneratedMessage; |
|
12 |
|
|
13 |
/** |
|
14 |
* AbstractProtoMapper provide common navigation methods on the protocolbuffers Messages. |
|
15 |
* |
|
16 |
* @author claudio |
|
17 |
*/ |
|
18 |
public abstract class AbstractProtoMapper { |
|
19 |
|
|
20 |
/** The Constant PATH_SEPARATOR. */ |
|
21 |
private static final String PATH_SEPARATOR = "/"; |
|
22 |
|
|
23 |
/** |
|
24 |
* Process multi path. |
|
25 |
* |
|
26 |
* @param proto |
|
27 |
* the proto |
|
28 |
* @param paths |
|
29 |
* the paths |
|
30 |
* @return the list |
|
31 |
*/ |
|
32 |
protected List<Object> processMultiPath(final GeneratedMessage proto, final List<String> paths) { |
|
33 |
final List<Object> response = Lists.newArrayList(); |
|
34 |
for (final String pathElements : paths) { |
|
35 |
response.addAll(processPath(proto, pathElements)); |
|
36 |
} |
|
37 |
return response; |
|
38 |
} |
|
39 |
|
|
40 |
/** |
|
41 |
* Process path. |
|
42 |
* |
|
43 |
* @param proto |
|
44 |
* the proto |
|
45 |
* @param path |
|
46 |
* the path |
|
47 |
* @return the list |
|
48 |
*/ |
|
49 |
protected List<Object> processPath(final GeneratedMessage proto, final String path) { |
|
50 |
return processPath(proto, Lists.newLinkedList(Splitter.on(PATH_SEPARATOR).trimResults().split(path))); |
|
51 |
} |
|
52 |
|
Also available in: Unified diff
branch for testing the upgrade to CDH 5.3.X