mapper.py
</>复制代码
#!/usr/bin/env python
"""A more advanced Mapper, using Python iterators and generators."""
import sys
def read_input(file):
    """Lazily split each line of *file* into whitespace-separated words.

    Args:
        file: Any iterable of text lines, e.g. ``sys.stdin``.

    Yields:
        The list of words found on each line, in input order. A blank
        line yields an empty list.
    """
    for line in file:
        # split() without arguments splits on any run of whitespace,
        # which also discards the trailing newline.
        yield line.split()
def main(separator="\t"):
    """Write ``<word><separator>1`` to STDOUT for every word read from STDIN.

    The emitted lines are the input for the Reduce step (reducer.py).

    Args:
        separator: String placed between the word and its count;
            defaults to a tab, giving tab-delimited output.
    """
    # input comes from STDIN (standard input)
    data = read_input(sys.stdin)
    for words in data:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        for word in words:
            # A single parenthesised argument prints identically under
            # both the Python 2 print statement and the Python 3 function.
            print("%s%s%d" % (word, separator, 1))
if __name__ == "__main__":
    # Run the mapper only when executed as a script, not when imported.
    main()
reducer.py
</>复制代码
#!/usr/bin/env python
"""A more advanced Reducer, using Python iterators and generators."""
from itertools import groupby
from operator import itemgetter
import sys
def read_mapper_output(file, separator="\t"):
    """Lazily parse mapper output lines into ``[key, value]`` lists.

    Args:
        file: Any iterable of text lines, e.g. ``sys.stdin``.
        separator: Delimiter between key and value; defaults to a tab.

    Yields:
        The result of splitting each line (trailing whitespace removed)
        on the first occurrence of *separator*: a two-item list when the
        separator is present, otherwise a one-item list holding the
        whole line.
    """
    for line in file:
        # maxsplit=1 keeps any further separators inside the value part.
        yield line.rstrip().split(separator, 1)
def main(separator="\t"):
    """Sum the counts per word from STDIN and print one total per word.

    Reads ``<word><separator><count>`` lines from STDIN and writes
    ``<word><separator><total>`` lines to STDOUT.

    ``itertools.groupby`` only merges *consecutive* items with the same
    key, so all lines for a given word must be adjacent in the input.

    Args:
        separator: Delimiter between word and count on both input and
            output; defaults to a tab.
    """
    # input comes from STDIN (standard input)
    data = read_mapper_output(sys.stdin, separator=separator)
    # groupby groups multiple word-count pairs by word,
    # and creates an iterator that returns consecutive keys and their group:
    #   current_word - string containing a word (the key)
    #   group - iterator yielding all ["<current_word>", "<count>"] items
    for current_word, group in groupby(data, itemgetter(0)):
        try:
            # "_" avoids rebinding current_word inside the generator.
            # ValueError is raised both when a count is not an integer
            # and when a line had no separator (one-item list fails to
            # unpack into two names).
            total_count = sum(int(count) for _, count in group)
        except ValueError:
            # count was not a number, so silently discard this item
            continue
        # A single parenthesised argument prints identically under
        # both the Python 2 print statement and the Python 3 function.
        print("%s%s%d" % (current_word, separator, total_count))
if __name__ == "__main__":
    # Run the reducer only when executed as a script, not when imported.
    main()
转自:http://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python/
文章版权归作者所有,未经允许请勿转载,若此文章存在违规行为,您可以联系管理员删除。
转载请注明本文地址:https://www.ucloud.cn/yun/45308.html
摘要: Caching Libraries for caching data. Beaker - A library for caching and sessions for use with web applications and stand-alone Python scripts and applications. dogpile.cache - dogpile.cache...
阅读 3199·2021-11-24 10:34
阅读 3423·2021-11-22 13:53
阅读 2745·2021-11-22 12:03
阅读 3742·2021-09-26 09:47
阅读 3113·2021-09-23 11:21
阅读 4991·2021-09-22 15:08
阅读 3479·2021-07-23 10:59
阅读 1335·2019-08-29 18:31