from pyspark.sql.functions import udf, col, lower, regexp_replace, concat_ws, trim from pyspark.ml.feature import Tokenizer, StopWordsRemover from pyspark.sql.types import ArrayType, StringType, IntegerType file_location = "dbfs:///FileStore/tables/all-news/*.csv" file_type = "csv" # CSV options infer_schema = "false" first_row_is_header = "true" delimiter = "," # The applied options are for CSV files. For other file types, these will be ignored. df_file = spark.read.format(file_type) \ .option("inferSchema", infer_schema) \ .option("header", first_row_is_header) \ .option("sep", delimiter) \ .load(file_location) \ .select('id', 'title', 'content') \ .na.drop() # Delete punctuation df_cleaned = df_file.select('id', (lower(regexp_replace('title', "[^a-zA-Z\\s]", " ")).alias('title')), \ (lower(regexp_replace('content', "[^a-zA-Z\\s]", " ")).alias('content'))) df_cleaned = df_cleaned.select('id', (regexp_replace('title', "[!-~]?\\b[\\w]\\b[!-~]?", " ")).alias('title'), \ (regexp_replace('content', "[!-~]?\\b[\\w]\\b[!-~]?", " ")).alias('content')) df_cleaned = df_cleaned.select('id', (regexp_replace(trim(col('title')), " +", " ")).alias('title'), \ (regexp_replace(trim(col('content')), " +", " ")).alias('content')) # Tokenize title title_tokenizer = Tokenizer(inputCol='title', outputCol='tokenized_title') df_tokenized_title = title_tokenizer.transform(df_cleaned).select('id', 'tokenized_title', 'content') # Remove stopwords from title stopwords_title_remover = StopWordsRemover(inputCol='tokenized_title', outputCol='cleaned_title') df_title_removed_stopwords = stopwords_title_remover.transform(df_tokenized_title).select('id', 'cleaned_title', 'content') # Clean words whose lenght is less than 1 filter_length_udf = udf(lambda row: [x for x in row if len(x) > 1], ArrayType(StringType())) df_final_title = df_title_removed_stopwords.withColumn('cleaned_title', filter_length_udf(col('cleaned_title'))) # Tokenize content content_tokenizer = Tokenizer(inputCol='content', outputCol='tokenized_content') df_tokenized_content = content_tokenizer.transform(df_final_title).select('id', 'cleaned_title', 'tokenized_content') # Remove stopwords from content stopwords_remover = StopWordsRemover(inputCol='tokenized_content', outputCol='cleaned_content') df_removed_stopwords = stopwords_remover.transform(df_tokenized_content).select('id', 'cleaned_title', 'cleaned_content') # Filter length in content df_final = df_removed_stopwords.withColumn('cleaned_content', filter_length_udf(col('cleaned_content'))) # Make title and content strings and id an integer df_final = df_final.withColumn('cleaned_title', concat_ws(" ", 'cleaned_title')) \ .withColumn('cleaned_content', concat_ws(" ", 'cleaned_content')) \ .withColumn('id', df_final['id'].cast(IntegerType())) \ .select('id', col('cleaned_title').alias('title'), col('cleaned_content').alias('content')) display(df_final)
articles_rdd = df_final.rdd.map(lambda x: (x['id'], x['title'], x['content'])) print(articles_rdd.take(5))
[(17283, 'house republicans fret winning health care suit new york times', 'washington congressional republicans new fear comes health care lawsuit obama administration might win incoming trump administration choose longer defend executive branch suit challenges administration authority spend billions dollars health insurance subsidies americans handing house republicans big victory issues sudden loss disputed subsidies conceivably cause health care program implode leaving millions people without access health insurance republicans prepared replacement lead chaos insurance market spur political backlash republicans gain full control government stave outcome republicans find awkward position appropriating huge sums temporarily prop obama health care law angering conservative voters demanding end law years another twist donald trump administration worried preserving executive branch prerogatives choose fight republican allies house central questions dispute eager avoid ugly political pileup republicans capitol hill trump transition team gaming handle lawsuit election put limbo least late february united states court appeals district columbia circuit yet ready divulge strategy given pending litigation involves obama administration congress inappropriate comment said phillip blando spokesman trump transition effort upon taking office trump administration evaluate case related aspects affordable care act potentially decision judge rosemary collyer ruled house republicans standing sue executive branch spending dispute obama administration distributing health insurance subsidies violation constitution without approval congress justice department confident judge collyer decision reversed quickly appealed subsidies remained place appeal successfully seeking temporary halt proceedings mr trump won house republicans last month told court transition team currently discussing potential options resolution matter take effect inauguration jan suspension case house lawyers said provide future administration time consider whether continue prosecuting otherwise resolve appeal republican leadership officials house acknowledge possibility cascading effects payments totaled estimated billion suddenly stopped insurers receive subsidies exchange paying costs deductibles eligible consumers race drop coverage since losing money loss subsidies destabilize entire program cause lack confidence leads insurers seek quick exit well anticipating trump administration might inclined mount vigorous fight house republicans given dim view health care law team lawyers month sought intervene case behalf two participants health care program request lawyers predicted deal house republicans new administration dismiss settle case produce devastating consequences individuals receive reductions well nation health insurance health care systems generally matter happens house republicans say want prevail two overarching concepts congressional power purse right congress sue executive branch violates constitution regarding spending power house republicans contend congress never appropriated money subsidies required constitution suit initially championed john boehner house speaker time later house committee reports republicans asserted administration desperate funding required treasury department provide despite widespread internal skepticism spending proper white house said spending permanent part law passed annual appropriation required even though administration initially sought one important house republicans judge collyer found congress standing sue white house issue ruling many legal experts said flawed want precedent set restore congressional leverage executive branch spending power standing trump administration may come pressure advocates presidential authority fight house matter shared views health care since precedents broad repercussions complicated set dynamics illustrating quick legal victory house trump era might come costs republicans never anticipated took obama white house'), (17284, 'rift officers residents killings persist south bronx new york times', 'bullet shells get counted blood dries votive candles burn people peer windows see crime scenes gone cold band yellow police tape blowing breeze south bronx across harlem river manhattan shorthand urban dysfunction still suffers violence levels long ago slashed many parts new york city yet city efforts fight remain splintered underfunded burdened scandal th precinct southern tip bronx poor minority neighborhoods across country people long hounded infractions crying protection grievous injury death september four every five shootings precinct year unsolved city precincts th highest murder rate fewest detectives per violent crime reflecting disparities staffing hit hardest neighborhoods outside manhattan according new york times analysis police department data investigators precinct saddled twice number cases department recommends even bosses called police headquarters answer sharpest crime rise city year across bronx investigative resources squeezed highest rate city five boroughs thinnest detective staffing nine precinct detective squads violent crime city borough robbery squad smaller manhattan even though bronx cases year homicide squad one detective every four murders compared one detective roughly every two murders upper manhattan one detective per murder lower manhattan lobbies family apartments outside methadone clinics art studios people take note inequity hear police commanders explain lack resources place floodlight dangerous block post officers corner watch witnesses cower behind doors fearful gunman crew confident police department ability protect though people see lot rarely testify south bronx many predominantly black hispanic neighborhoods like united states contract police community tatters people stories crime reports ignored calls went unanswered hours others tell call help ending caller arrest minor charge leading hours fetid holding cell paradox policing th precinct neighborhoods historically prime targets aggressive tactics like designed ward disorder precinct detectives less time anywhere else city answer blood spilled violent crimes gola white beside daughter shot killed playground summer four years son gunned housing project ticked public safety resources said scant bronx neighborhoods like security cameras lights locks investigating police officers nothing said comes families said authorities really care much feel times documenting murders logged year th precinct one handful neighborhoods deadly violence remains problem era crime new york city homicides precinct year nine strain detectives go unsolved half year look take law hands hundreds conversations grieving relatives friends witnesses police officers social forces flare murder place like th precinct become clearer merciless gang codes mental illness drugs long memories feuds simmered officers view reasons murders never solved also emerge paralyzing fear retribution victims carrying secrets graves relentless casework forces detectives move hopes break come later frustrations build sides detectives phones rarely ring tips officers grow embittered witnesses cooperate meantime victim friends conduct investigations talk grabbing stash gun wheel well mother apartment find suspect chasm police community gangs gun violence flourish parents try protect families drug crews threats officers work overcome residue years mistrust understaffing communities still go racing one call next streets around st mary park scene two fatal shootings logged th precinct year unsolved james fernandez heard talk murders door apartment east th street betances houses lived end long hallway strewn hypodermic needles empty dope bags discarded hennessy bottles young men spoke subset bloods gang made drug market slinging marijuana cocaine regulars flashing firearms blowing smoke fernandez apartment mr fernandez asked young men move answered busting car kind crime anachronism much new york still rattles th precinct even though murders fallen year major felony crimes per resident residential district city also one poorest communities country many young men find way underground markets mr fernandez one shrink threats growing lower east side rode bicycle around customers drug dealers worked collected payments backpack leaving life got tech maintenance job three years ago moved betances houses wife daughter two choices get help drug crew call police help risk labeled snitch call old lower east side bosses muscle risk violence chose police walked local substation police service area asked protection daughter using inhaler relieve coughs marijuana smoke mr fernandez wife got terrible headaches lot killers going kill sergeant police report quoted telling mr fernandez august second report filed day said warned going shoot window mr fernandez told police teenagers names appear reports went home said one friends seen walk substation tried intimidate filing another report three days later propped bike door said open door say something body slam mr fernandez wife maria fernandez wrote slips paper used document hallway ruckus inadequate police response boys made comments easy target slap opened door made drug sale threatened beat fernandez family ones snitching notes say another complaint substation days first brought relief week later feeling desperate ms fernandez tried calling first substation one boys blew weed smoke door made threat attack police never came wrote notes tried th precinct station house next officers desk left standing public waiting area said making fear seen officers put worse danger months later said came door announced front teenagers complaint drug activity mr fernandez started work said police failed wired camera peephole record drugs guns footage hark back new york still much present precinct residents around morning sgt michael lopuzzo walks tall wooden doors th precinct station house cases land metal desk dead bodies known cause strip club brawls shooting victims hobbling hospital bring resistance every turn reminding earlier era city campaign haven got one single phone call putting right direction said sergeant lopuzzo head precinct detective squad one day summer worked answer email inquiry murder victim aunt killer caught people understand often detectives feel effects people turning police witnesses shout away doors neighbors know refuse talk people shot wounded bronx early september third victims refused cooperate period th precinct squad detectives closed three nonfatal shootings robbery cases part resistance stems preventive policing tactics like hallmark style former mayor michael bloomberg police commissioner raymond kelly near height strategy th precinct stops city stops officers used force frisks stops year people done nothing criminal precinct also one areas department flooded newest officers roll calls pressured generate numbers write tickets make arrests choice give summons young man playing park dark even officers done growing neighborhood need bring something today justify existence officer argenis rosado joined precinct said interview station house re small area day day re hammering community course community eventually going turn pressure warped way officers residents saw rookies ignore someone might drinking outside sitting stoop cops came time probably viewed community differently said hector espada veteran precinct wanted way somehow give someone summons feel like guys still civil conversation morale wilted aged station house alexander avenue mott haven officers felt pressure downgrade crime complaints make appear less serious several said interviews overlooked crime reports immigrants seen unlikely complain watched supervisors badger victims repeating stories hopes drop complaints practice downgrading complaints resulted disciplining officers precinct last year one string scandals left officers feeling overscrutinized problems also existed elsewhere four commanders precinct sent packing five years one officers found ticket fixing forgiving parking tickets friends another recorded giving guidance stop frisk black boys men ages officers fled commands others became reluctant take assignments proactive policing units like put situations street whenever walked doors precinct seemed like black cloud said russell lewis th like heaviness walked wanted hours minutes go home didn want get caught anything precinct covers two square miles dozen housing projects mean overflows people methadone clinics draw addicts around city lofts southern edge precinct presage wave gentrification even police department hired officers neighborhood policing counterterrorism officers th precinct said still rush calls shift number unchanged new police commissioner james neill said handling similar south bronx precinct years ago several dozen calls time waiting response residents know want police domestic problem helps hint weapon last year precinct drew number civilian complaints officer misconduct city lawsuits stemming police actions precinct trying improve morale new commanding officer deputy inspector brian hennessy cadre department calls neighborhood coordination officers patrol since last january part citywide effort mr neill mayor bill de blasio bring back beat cop unencumbered chasing every last call listen people concerns help investigations precinct made among gun arrests city officers said discretion resolve encounters without summons arrest one corner near school courtlandt avenue east st street long spawned complaints gunfire fights inspector hennessy officers painted graffiti swept drug paraphernalia summer people said first answer complaints years inspector acknowledged residue policing lingers perception really sticks said workload th precinct startling reveals gap detective squads equipped answer violent crime manhattan compared bronx brooklyn queens three precinct detectives carrying cases year many others loads high even though department advises violent precincts assigned homicide typically four days investigate dealing cases quieter precincts give detectives month little distraction investigate murder detectives th precinct handled average violent felonies year murders rapes felony assaults robberies contrast detective precinct southern end staten island carries nine cases detective precinct patrolling union square gramercy park handles detective precinct washington heights handles citywide median last year th violent crime cases per detective bronx whole precinct detective carried average violent felonies year compared manhattan brooklyn queens staten island rape cases robbery patterns later sent specialized units precinct detectives extensive initial work interview victims write reports process evidence precincts much manhattan whiter wealthier south bronx often property felonies like stolen laptops credit cards police say complex even accounting crimes th precinct heaviest caseloads overall crime per detective city michael palladino head detectives endowment association former bronx officer said staffing disparities affected department efforts build trust communities like south bronx witnesses make calculation said cooperate detectives much work won even get chance protect ll late retaliation comes sergeant lopuzzo turned prestigious post stay th precinct said squad worked tirelessly handle cases people every squad wanted detectives staffing needs counterterrorism units task forces created new deployment challenges across department fight army army wish said details police department assigns officers closely held constantly flux public minimal information personnel allocated presented times analysis confidential staffing data department chief detectives robert boyce vowed send detectives th precinct said department reassess deployment broadly troubled precincts said recent decision bring gang narcotics vice detectives command made easier shift personnel chief boyce said burdens detectives went beyond felony crimes include cases noted support precinct squads got centralized units focusing robberies gangs grand larcenies example major crime keeps pounding th precinct rates tenth percent lower even citywide crime dropped third period precinct detective squad shrank eight investigators years according staffing data obtained city council freedom information law request squad covering union square gramercy park crime dropped third period grew investigators th precinct given additional detective four investigators summer already missing three detectives illness reasons retired detectives skeptical community relations alone drive crime city last busiest precincts rather say police department dedicating resources providing sort robust investigative response seems standard manhattan crime manhattan solved said howard landesberg th precinct detective late outer boroughs like forgotten retired detectives said understaffing made harder solve crimes bronx brooklyn queens higher prevalence gang drug killings already saddled investigators cases people inclined cooperate detectives closed percent homicides manhattan percent staten island year compared percent bronx percent queens percent brooklyn last year homicides detectives cleared percent manhattan percent bronx percent queens percent staten island percent brooklyn culture police department worry manhattan said joseph giacalone former sergeant bronx cold case squad part money added de blasio came talked tale two cities done complete opposite said business usual bronx struggles extend prosecutions last five years prosecutors bronx declined prosecute violent felony cases anywhere else city rate conviction bronx routinely lowest city well ticked year surpass brooklyn rate november bronx prosecutors work streamline cases cases become even difficult win problem th precinct allowed defense lawyers attack credibility officers implicated said patrice shaughnessy spokeswoman bronx district attorney office district attorney darcel clark elected said statement judge bronx heard jurors impartial trust police tide mistrust sergeant lopuzzo detectives work hours straight fresh cases buy chinese takeout money murder suspect carry surveillance videos home hopes personal computers may enhance better squad computer buy urn homeless mother murdered son ashes box months killing seem like people glittering city paying attention th precinct homicide victims newly fatherless children go back school without therapist help victims families wander confused courthouse nearly miss appearance newspapers largely ignore killings people criminal pasts pushing priority lists chiefs police headquarters stuffy squad room detectives th precinct grapple inheritance government neglect meet mothers believe sons might never murdered city guidance counselor listened pleas help stay enrolled city housing worker fixed locks lights building detectives work alongside vicious system streets punishing police cooperators young men scan court paperwork prison looking names people turned one murder victim precinct year cast crew avoided arrested gang takedown believed cooperating longtime th precinct detective jeff meenagh said witness homicide case going testify went back neighborhood told anyone testified get deserve allies sergeant lopuzzo makes friendly long helped clear woman son robbery charge locating surveillance video proved robber mother started calling tips code name gun car example always refused testify cut ties year sergeant lopuzzo arrested son stabbing two people shooting new york city owns east th street buildings side james fernandez betances houses said reality ground different drug boss ran block october mr fernandez increasingly afraid fed mr fernandez wife went far give officers keys building door get whenever wanted showed videos offered access camera see happening hallway couple officers said needed supervisor permission others answered young men making threats officers occasionally stopped outside building causing young men scatter come inside mr fernandez said menacing worsened mr fernandez daughter harassed arrived home school grew distressed parents start seeing therapist mr fernandez made several complaints office borough president ruben diaz jr visited victim advocate district attorney office oct sent online note police commissioner office went proper channels help note said precincts failed us except officers helped us hands tied one else turn months video multiple crimes taking place extreme danger th psa won anything wrote referring local substation please need speak one authority local substation commander deputy inspector jerry sullivan bronx narcotics unit alerted complaints mr fernandez said never heard relied street instincts protect family made pleas man thought employing dealers hallway activity quieted briefly returned young men rented room woman apartment upstairs mr fernandez approached different man learned boss operation man agreed ask dealers calm even hired drug customer sweep hallway mr fernandez said two weeks later dealing harassment resumed went old lower east side bosses hired men trail wife daughter way building make sure made safely school times sat outside betances houses also bought two bulletproof vests find one small enough daughter faith city new york faith police faith politicians mr fernandez said thing know sure god re situation left defend family paying close attention happening hallway mr fernandez said learned details two recent homicides th precinct investigating calls help going nowhere said decided put greater risk talking tell police learned bending backward nobody even anything said going help ain going help last january new neighborhood coordination officer working residents betances houses ended arrests housing command inspector sullivan said chief boyce said silos gang narcotics detectives used work made responding complaints difficult recent restructuring remove obstacles one live like mr fernandez lived people dealing drugs outside apartment said mr fernandez complaints spur arrests two men hallway caught separately year shootings one mr fernandez named police report charged summer hitting officer metal folding chair firing three gunshots crowd court papers say held rikers island attempted murder charge late mr fernandez may moved family away'), (17285, 'tyrus wong bambi artist thwarted racial bias dies new york times', 'walt disney bambi opened critics praised spare haunting visual style vastly different anything disney done know film striking appearance created chinese immigrant artist took inspiration landscape paintings song dynasty extent contribution bambi remains mark film animation widely known decades like film title character artist tyrus wong weathered irrevocable separation mother hope making life america incarceration isolation rigorous interrogation still child years followed endured poverty discrimination chronic lack recognition work disney also fine art finding acclaim mr wong died friday hollywood studio artist painter printmaker calligrapher illustrator later years maker fantastical kites one celebrated artists th century marginalization long subject passed much career unknown general public artistic recognition mr wong find noteworthy fact among chinese immigrant men generation professional prospects largely limited menial jobs like houseboy laundryman trained painter mr wong leading figure modernist movement flourished california first second world wars work included group shows art institute chicago also featured picasso matisse paul klee staff artist hollywood studios drew storyboards made vibrant paintings detailed architectural illustrations helped director envision scene shot years work informed look animated pictures disney films warner brothers studios among sands iwo jima rebel without cause wild bunch dozens films worked bambi mr wong belatedly renowned truly involved every phase production john canemaker animator historian animation new york university said interview obituary march created art direction really never seen animation mr wong subject water paper paint sky major retrospective disney family museum san francisco museum windows overlook san francisco bay contemplate angel island nine decades earlier lone sought gain admission country adamantly want wong gen yeo name sometimes romanized wong gaing yoo born oct farming village guangdong province young child already exhibited love drawing encouraged father seeking better economic prospects gen yeo father embarked united states leaving mother sister behind gen yeo never see mother obliged travel false identities state affairs known among chinese immigrants paper son hope circumventing chinese exclusion act signed law president chester arthur act drastically curtailed number chinese people allowed enter country among earliest united states laws impose severe restrictions immigration unforeseen loophole opened form san francisco earthquake fire huge number municipal documents including birth immigration records destroyed many newly arrived chinese capitalized loss maintaining born san francisco fire united states citizens entitled bring relatives case gen yeo father paper sons posing relatives attuned deception united states immigration officials put chinese arrivals formidable inquisition ensure claimed questions came like gunfire direction village face many windows house house rice bin wide well deep trees village lakes shops name sponsoring relative interrogated separately answers match new arrival major mistake series smaller ones mean deportation stand chance passing aspirants memorized rigorous dossiers known coaching papers ensuing interrogation hard enough adults gen yeo undergo alone dec month sea wongs landed angel island immigration station elder mr wong traveling merchant named look get son look tai yow angel island considered ellis island west coast lisa see author gold mountain nonfiction chronicle family said interview however continued goal really different ellis island supposed welcoming angel island opened specifically keep chinese mr wong father previously lived united states look get able clear immigration quickly new arrival gen yeo detained island nearly month child among immigrants held scared half death cried mr wong recalled tyrus documentary directed pamela tom premiered every day miserable miserable hated place jan presence interpreter stenographer young gen yeo posing look tai yow interrogated three inspectors father already questioned gen yeo well prepared answered without error sacramento joined father schoolteacher americanized tai yow tyrus known tyrus wong ever soon afterward father son separated elder mr wong moved los angeles seek work reasons lost time take son tyrus lived sacramento boardinghouse attending elementary school two years later possibly tyrus traveled los angeles join father found work gambling den lived boardinghouse sandwiched butcher shop brothel school tyrus worked houseboy two pasadena families earning cents day first art teacher father trained nightly calligraphy dip brush water trace ghostly characters newspaper afford ink drawing paper tyrus junior high teacher noting drawing talent arranged summer scholarship otis art institute los angeles account indifferent student public school tyrus found calling institute otis college art design scholarship ended declined return junior high father scraped together tuition small fortune let stay otis youngest student studied least five years simultaneously working school janitor graduating long afterward father died leaving young mr wong entirely mr wong artist works progress administration creating paintings libraries public spaces friends including artist benji okubo founded oriental artists group los angeles organized exhibitions members work level exposure asian artists time mr wong newly married needing steady work joined disney creating thousands intermediate drawings bring animated sequences life asians novelty hollywood studios mr wong made keenly aware fact first disney later warner brothers one flung racial epithet another assumed sight worked company cafeteria affront job painstaking repetitive mr wong quickly work animation terrible use talents landscape artist painter mr canemaker said reprieve came late mr wong learned disney adapting bambi life woods novel austrian writer felix salten fawn whose mother killed hunter trying animate book disney reached impasse studio enjoyed great success animated film snow white seven dwarfs baroque production every detail backgrounds every petal every flower every leaf every tree meticulously represented attempt use similar style bambi found ornate backgrounds camouflaged deer forest creatures narrative centered mr wong spied chance said gee outdoor scenery recalled video interview years afterward adding said gee landscape painter invoking exquisite landscape paintings song dynasty rendered watercolors pastels series nature scenes moody lyrical atmospheric lush spare backgrounds subtly suggested stroke two brush walt disney went crazy said mr canemaker wrote mr wong book animation begins art lives disney inspirational sketch artists said love indefinite quality mysterious quality forest mr wong unofficially promoted rank inspirational sketch artist mr canemaker explained designer person went questions color lay something even influenced music special effects look drawings inspired people mr wong spent two years painting illustrations inform every aspect bambi throughout finished film lent brooding quality stark landscapes misty desaturated palette figures often seen silhouette influence unmistakable wake bitter employees strike year disney fired mr wong though chosen strike felt studio good mr canemaker said let go amid lingering climate resentments bambi mr wong name appears quite far credits mere background artist mr wong joined warner brothers working lent occasion studios retirement indignities endured confined studios trying buy house wife former ruth kim told property inquired sold month go back sign still mr wong recalled tyrus japanese attack pearl harbor december mr wong like many took wearing lapel button proclaiming heritage lest angry american beat street war permanently dispersed fledgling oriental artists group mr wong friend mr okubo sent tens thousands internment camp world war ii hadn happened think artists even artists name today ms see said little movement barely started split apart war mr wong became united states citizen also designed christmas cards hallmark painted elegant designs dinnerware sought collectors longtime resident sunland calif became retirement renowned kitemaker designing building hand coloring astonishing airworthy creations butterflies swallows whole flocks owls centipedes feet long streaked southern california sky like paint blue canvas last years ruth wong life ill dementia mr wong forsook work care death slowly began making art formal recognition influence bambi mr wong named disney legend honor whose previous recipients include fred macmurray julie andrews annette funicello bestowed walt disney company outstanding contributions retrospective work curated part ms see inaugural exhibition chinese american museum los angeles disney family museum retrospective water paper paint sky traveled museum chinese america lower manhattan mr wong death home sunland confirmed filmmaker ms tom survivors include three daughters kay fong wong kim wong two grandchildren daughters small mr wong encouraged make art father encouraged yet let coloring books reason simple want children constrained said lines laid others'), (17286, 'among deaths heavy toll pop music new york times', 'death may great equalizer isn necessarily evenhanded fields endeavor suffered mortal losses consider muhammad ali arnold palmer sports hollywood deaths carrie fisher debbie reynolds pop music world hands bleakest year start david bowie whose stage persona androgynous glam rocker dance pop star electronic experimentalist music year days old news came died cancer hinted time short lyrics final album released two days death otherwise gone great lengths hide illness public wish privacy ensured death appear come blue came another shock three months later prince accidentally overdosed painkiller collapsed elevator sprawling home studio near minneapolis death came indications one including prince rogers nelson seen coming energetic onstage ever holding otherwise healthy regimen successfully defied age sixth decade death leonard cohen hand rd year undoubtedly see coming shoulder went hesitate say merry way ever wise troubadour playing sellout crowds shrugging inevitable knowing dark finally overtake saying essentially another song hadn delivered enough jolts system closed year yet another death george michael sensation whose aura dimmed later years went bed never woke christmas pop music figures fell year many voices still embedded nicked vinyl grooves old records lot people bear throw roster included paul kantner jefferson airplane keith emerson greg lake emerson lake palmer glenn frey eagles maurice white earth wind fire leon russell piano pounder delta blues wail mountain man mass hair died merle haggard rugged country poet common man outlaw joined bluegrass legend ralph stanley guitar virtuoso practically glued elvis swiveling hips early days scotty moore george martin whose genius creative influence sounds john paul george ringo extension entire rock era hailed fifth beatle music stars fill arenas idols another stripe mighty athletes left scene figure among towering ali called greatest sports figure th century boxer combined power grace brains way ring never seen great athlete matters war race religion coursed life publicly turbulent way people hated refused drafted vietnam war decision cost heavyweight title people admired even loved principled stands high spirits lightning mind winking yes rhyming motormouth illness closed little contain certainly mere ropes around ring palmer transformational golf first media star gentleman game never quite began gathering army rolling greenswards leading charge shirt coming untucked cigarette dangling lips club weapon pressed attack entire generation postwar guys took game arnie women athletically blessed magnetically cool telegenically handsome somehow one said gordie howe mr hockey son saskatchewan prairie tore national hockey league hung skates died ralph branca trolley car conductor son living reminder one crushing mistake fastball bobby thomson decided national league pennant sometimes never lived pat summitt coach elevated women basketball led tennessee teams eight championships won games college coach defeat alzheimer disease dying within months national basketball association lost two giants different eras clyde lovelette olympic college champion transformed game one first truly big men hardwood heir nate thurmond defensive stalwart battled russell wilt kareem paint hall fame career even older baseball ranks monte irvin died people still around remember watching play particularly prime star negro circuit barred major leagues made hall fame anyway new york giant became major league baseball first black executive died fans pondered question hung many athletic career shackled discrimination different question entirely different sphere arose stunning news justice antonin scalia died hunting trip texas thick one consequential supreme court careers modern times left void conservative jurisprudence urgently vacancy bench yet filled raising still questions may await country exits public stage returned us past nancy reagan death evoked white house glamour west coast conservatism took residence banks potomac john glenn us thinking burst national pride soaring outer space deaths tom hayden daniel berrigan avatars defiance harked back student rebellions vietnam war roiling home front phyllis schlafly obituaries windows roots right wing ascension american politics death janet reno first woman serve attorney general recalled clinton years eight firestorm waco tex international tug war cuban boy named eli gonz lez bitter senate battle impeachment shores fidel castro death summoned memories cuban revolution nuclear brinkmanship enduring enmity strongman superpower miles away name boutros egyptian diplomat led united nations led replayed nightmares genocide rwanda bosnia death shimon peres removed last link founding israel conjured decades growing military power fitful strivings peace elie wiesel new york tireless struggle compel world never forget made us confront gas chambers auschwitz writers even fiction world poorer without literary voices harper lee umberto eco pat conroy jim harrison anita brookner alvin toffler gloria naylor william trevor mention playwrights peter shaffer dario fo edward albee dead treasured spun viewing pleasure none lustily ms fisher princess leia star wars tales day later capping year startling deaths ms reynolds singing acting leading lady earlier era died throes mother grief devotees harry potter movies saddened death alan rickman played deliciously dour professor severus snape blockbuster franchise whose career stage screen far richer many snape younger fans may known zsa zsa gabor celebrity contrast outshone modest acting career gene wilder garry shandling died year perfected brand hilariously neurotic comedy fit culture time abe vigoda godfather movies barney miller actually die actually done years ago wildly uninformed people spread word side camera directors whose vision came us parts jacques rivette french new wave auteur meditations life art abbas kiarostami iranian master searching examinations ordinary lives andrzej wajda rival ingmar bergman akira kurosawa critics eyes haunting tales poland boot first nazis communists long roster television stars generation two ago passed images younger selves frozen time noel neill adventures superman alan young mister ed robert vaughn man william schallert patty duke father daughter patty duke show dan haggerty life times grizzly adams florence henderson brady bunch alan thicke growing pains garry marshall creative force practically owned prime time happy days mork mindy laverne shirley died broadway lights dimmed memory brian bedford tammy grimes anne jackson brilliant day architect zaha hadid left behind monuments fertile imagination shaken acolytes around world street photographer bill cunningham found fashion statements every corner suddenly missing making manhattan overnight less idiosyncratic less interesting place smiling skinny man pedaling bicycle among honking cabs blue french worker jacket camera slung around neck picture split scene seemingly generation fellow photographers made art recording last half th century ruth gruber marc riboud louis stettner tv journalists morley safer gwen ifill tv commentator john mclaughlin tried make sense music precincts emptier without conductor revolutionary composer pierre boulez new music soprano phyllis curtin jazz artists mose allison bobby hutcherson gato barbieri rapper phife dawg malik taylor latin megastar juan gabriel silicon valley saw giant depart andrew grove led semiconductor revolution intel television industry lost executive grant tinker made nbc network watch prime time astrophysics smaller world women science said farewell pioneer champion vera rubin tens thousands people might choked death saved simple ingenious maneuver passing henry heimlich prompted sympathy even gratitude come think eliciting large silent thank live bad way anyone go brings us marion pritchard died inspired measures gratitude profound brave young dutch student gentile risked life save jews death camps early one instance shooting nazi stooge seize three little children hiding estimate saved people many still alive died weeks ago anyone guess know certain reasonably surmise good many still possession selfless gift matchless legacy lives'), (17287, 'kim jong un says north korea preparing test long range missile new york times', 'seoul south korea north korea leader kim said sunday country making final preparations conduct first test intercontinental ballistic missile bold statement less month inauguration donald trump although north korea conducted five nuclear tests last decade ballistic missile tests alone although habitually threatens attack united states nuclear weapons country never intercontinental ballistic missile icbm annual new year day speech broadcast north kctv sunday mr kim spoke proudly strides said country made nuclear weapons ballistic missile programs said north korea continue bolster weapons programs long united states remained hostile continued joint military exercises south korea reached final stage preparations intercontinental ballistic rocket said analysts region said mr kim might conduct another weapons test coming months taking advantage leadership changes united states south korea mr trump sworn jan south korea president park whose powers suspended parliamentary impeachment dec waiting constitutional court rule whether formally removed office reinstated north korea conducts test coming months test mr trump new administration despite years increasingly harsh sanctions north korea advancing toward mr kim professed goal arming isolated country ability deliver nuclear warhead united states mr kim speech sunday indicated north korea may rocket several times year complete icbm program said cheong senior research fellow sejong institute south korea first tests come even mr trump inauguration mr cheong said need take note fact first new year speech kim mentioned intercontinental ballistic missile said speech mr kim comment mr trump election doubt still runs deep north korea mastered technology needed build reliable icbm analysts region said north launchings rockets put satellites orbit recent years showed country cleared key technological hurdles north satellite launch february south korean defense officials said unha rocket used launch successfully reconfigured missile fly miles warhead pounds far enough reach united states north korea deployed rodong ballistic missiles reach south korea japan spotty record musudan ballistic missile range long enough reach american military bases pacific including guam north also claimed series successes testing various icbm technologies although claims verified often disputed officials analysts region said make nuclear warheads small enough fit onto ballistic missile also claimed success testing technology allows missile return earth atmosphere without breaking april north korea reported successful ground test engine intercontinental ballistic missile time mr kim said north tip intercontinental ballistic rockets powerful nuclear warheads keep cesspool evils earth including mainland within striking range sept north conducted fifth powerful nuclear test mr kim later attended another ground test new rocket engine exhorting government prepare another rocket launch soon possible november united nations security council imposed new sanctions north')]
# Inverted index import itertools import operator def accumulate(l): it = itertools.groupby(l, operator.itemgetter(0)) for key, subiter in it: yield key, sum(item[1] for item in subiter) inverted_index_rdd = articles_rdd.flatMap(lambda line: [(word , (line[0], 1)) for word in (line[1] + " " + line[2]).split(" ")]) \ .groupByKey() \ .map(lambda word: (word[0], list(word[1]))) \ .map(lambda lista: (lista[0], sorted(list(accumulate(lista[1])), key = lambda x: -x[1]))) \ .cache() inverted_index_rdd.take(5)
Out[3]:
[('sensitisation', [(145965, 1)]),
('eftersom', [(46868, 1)]),
('ehning', [(54324, 3)]),
('avis',
[(19086, 6),
(175175, 3),
(None, 2),
(21022, 1),
(21537, 1),
(22811, 1),
(23932, 1),
(43766, 1),
(46197, 1),
(52857, 1),
(75082, 1),
(80066, 1),
(189553, 1),
(197133, 1),
(134572, 1),
(135331, 1),
(140388, 1),
(142943, 1),
(147717, 1)]),
('usine', [(33345, 1)])]
toSearch = str(dbutils.widgets.get("word")) final_result = inverted_index_rdd.filter(lambda x, toSearch=toSearch: x[0] == toSearch) \ .flatMap(lambda result: result[1]) final_result_list = final_result.collect() print(final_result_list)
[(157218, 37), (21637, 35), (22019, 18), (80078, 18), (172524, 17), (None, 15), (152630, 14), (153519, 14), (81994, 12), (67320, 11), (20670, 10), (21956, 10), (52043, 10), (152370, 10), (152725, 10), (156506, 10), (190400, 10), (21938, 9), (25214, 9), (56559, 9), (167381, 9), (186035, 9), (202471, 9), (71258, 9), (21429, 8), (57134, 8), (164430, 8), (167357, 8), (171843, 8), (172671, 8), (191001, 8), (198944, 8), (None, 8), (146054, 8), (70542, 8), (57626, 7), (59310, 7), (78796, 7), (80442, 7), (156494, 7), (168775, 7), (173338, 7), (190398, 7), (190411, 7), (193556, 7), (203825, 7), (20237, 6), (25996, 6), (32497, 6), (52978, 6), (80649, 6), (80812, 6), (153549, 6), (153694, 6), (170202, 6), (170348, 6), (172988, 6), (191066, 6), (64052, 6), (71053, 6), (72473, 6), (19805, 5), (23303, 5), (28682, 5), (42312, 5), (45251, 5), (51834, 5), (52974, 5), (58884, 5), (59693, 5), (80713, 5), (81300, 5), (167206, 5), (170221, 5), (174602, 5), (186097, 5), (190277, 5), (191000, 5), (192082, 5), (192872, 5), (197685, 5), (198660, 5), (202520, 5), (146187, 5), (147477, 5), (71851, 5), (73163, 5), (18295, 4), (18559, 4), (20117, 4), (22325, 4), (22589, 4), (25974, 4), (28274, 4), (32957, 4), (35666, 4), (36914, 4), (38158, 4), (38201, 4), (39772, 4), (41606, 4), (41660, 4), (43784, 4), (45410, 4), (51474, 4), (52943, 4), (58069, 4), (59383, 4), (74685, 4), (80907, 4), (152863, 4), (152905, 4), (156136, 4), (172815, 4), (173090, 4), (174359, 4), (174408, 4), (174543, 4), (177288, 4), (185711, 4), (190909, 4), (190971, 4), (191014, 4), (191753, 4), (192135, 4), (193184, 4), (193809, 4), (146776, 4), (65437, 4), (22581, 3), (24478, 3), (24522, 3), (24533, 3), (25461, 3), (27903, 3), (29708, 3), (33004, 3), (39955, 3), (39996, 3), (41143, 3), (48866, 3), (55106, 3), (58238, 3), (58294, 3), (59380, 3), (59670, 3), (78780, 3), (78894, 3), (82116, 3), (153612, 3), (156373, 3), (157552, 3), (157851, 3), (163449, 3), (167092, 3), (167187, 3), (167263, 3), (168515, 3), (169825, 3), (170546, 3), (171526, 3), (172122, 3), (172737, 3), (174374, 3), (175176, 3), (180232, 3), (185351, 3), (185849, 3), (193964, 3), (194747, 3), (199005, 3), (201969, 3), (71186, 3), (19005, 2), (22030, 2), (23075, 2), (24520, 2), (24574, 2), (24949, 2), (24964, 2), (25312, 2), (25497, 2), (26009, 2), (26486, 2), (28681, 2), (28829, 2), (32112, 2), (32504, 2), (35813, 2), (38177, 2), (43641, 2), (44417, 2), (46435, 2), (47945, 2), (49012, 2), (52168, 2), (54096, 2), (54307, 2), (57396, 2), (58240, 2), (59339, 2), (59352, 2), (59381, 2), (59471, 2), (59488, 2), (73942, 2), (78611, 2), (80067, 2), (80648, 2), (80696, 2), (80786, 2), (81002, 2), (82558, 2), (155831, 2), (155980, 2), (156214, 2), (156220, 2), (156393, 2), (156522, 2), (161124, 2), (166746, 2), (167156, 2), (167626, 2), (169973, 2), (170166, 2), (171490, 2), (172051, 2), (172569, 2), (173382, 2), (174230, 2), (174254, 2), (175325, 2), (176479, 2), (180181, 2), (180543, 2), (182433, 2), (187963, 2), (189782, 2), (190713, 2), (193842, 2), (193998, 2), (194710, 2), (198008, 2), (198834, 2), (135149, 2), (139220, 2), (144164, 2), (145765, 2), (147445, 2), (148358, 2), (149787, 2), (150191, 2), (151722, 2), (71352, 2), (17366, 1), (17416, 1), (17430, 1), (17663, 1), (17719, 1), (18613, 1), (18655, 1), (18749, 1), (19116, 1), (19358, 1), (19410, 1), (19792, 1), (19815, 1), (19831, 1), (20146, 1), (20451, 1), (20523, 1), (20652, 1), (21257, 1), (21313, 1), (21408, 1), (21829, 1), (22043, 1), (22053, 1), (22202, 1), (22452, 1), (22477, 1), (22512, 1), (22563, 1), (22567, 1), (22728, 1), (22982, 1), (23117, 1), (23225, 1), (23311, 1), (23333, 1), (23675, 1), (24544, 1), (24553, 1), (24605, 1), (24954, 1), (25154, 1), (25363, 1), (25385, 1), (25502, 1), (25547, 1), (25569, 1), (26296, 1), (28100, 1), (28476, 1), (28974, 1), (29294, 1), (29967, 1), (30013, 1), (30756, 1), (31432, 1), (31809, 1), (32022, 1), (32044, 1), (32679, 1), (33742, 1), (34513, 1), (34820, 1), (34836, 1), (34888, 1), (34905, 1), (35030, 1), (35286, 1), (36547, 1), (36782, 1), (37199, 1), (37388, 1), (37487, 1), (37764, 1), (37819, 1), (37975, 1), (37980, 1), (38366, 1), (38457, 1), (38854, 1), (39137, 1), (39787, 1), (40124, 1), (40412, 1), (40519, 1), (41671, 1), (41804, 1), (42060, 1), (42180, 1), (42571, 1), (42894, 1), (44852, 1), (45183, 1), (45324, 1), (45721, 1), (46072, 1), (46489, 1), (46798, 1), (46971, 1), (47516, 1), (47698, 1), (48351, 1), (48597, 1), (49082, 1), (49130, 1), (49597, 1), (49808, 1), (50395, 1), (50720, 1), (52788, 1), (52904, 1), (52936, 1), (53471, 1), (54104, 1), (54106, 1), (54259, 1), (54271, 1), (55225, 1), (55363, 1), (55418, 1), (55578, 1), (56655, 1), (56657, 1), (56827, 1), (56930, 1), (57186, 1), (57427, 1), (58798, 1), (58876, 1), (59030, 1), (59159, 1), (59353, 1), (59386, 1), (59415, 1), (59678, 1), (59694, 1), (59792, 1), (59865, 1), (60113, 1), (60361, 1), (60808, 1), (74269, 1), (75324, 1), (75567, 1), (75872, 1), (75893, 1), (76139, 1), (76494, 1), (76649, 1), (76843, 1), (76908, 1), (77035, 1), (77358, 1), (77538, 1), (77804, 1), (78544, 1), (78598, 1), (79484, 1), (79633, 1), (79878, 1), (79971, 1), (80293, 1), (80744, 1), (80891, 1), (81003, 1), (81282, 1), (81306, 1), (81579, 1), (81593, 1), (81657, 1), (81839, 1), (81982, 1), (82230, 1), (82355, 1), (82390, 1), (82416, 1), (152323, 1), (152597, 1), (152709, 1), (152864, 1), (152906, 1), (153131, 1), (153160, 1), (153200, 1), (153381, 1), (153562, 1), (153881, 1), (154166, 1), (155720, 1), (155821, 1), (156127, 1), (157018, 1), (157019, 1), (157814, 1), (158015, 1), (158154, 1), (161313, 1), (161340, 1), (161486, 1), (161506, 1), (161526, 1), (161571, 1), (162374, 1), (162486, 1), (163425, 1), (163717, 1), (163757, 1), (164547, 1), (164659, 1), (165414, 1), (165788, 1), (166585, 1), (166628, 1), (166684, 1), (166771, 1), (166844, 1), (166901, 1), (166971, 1), (167046, 1), (167087, 1), (167142, 1), (167439, 1), (167492, 1), (167498, 1), (167535, 1), (167602, 1), (167883, 1), (168323, 1), (168468, 1), (168782, 1), (168877, 1), (169358, 1), (170117, 1), (170119, 1), (170158, 1), (170163, 1), (170232, 1), (171226, 1), (171513, 1), (172055, 1), (172087, 1), (172556, 1), (172938, 1), (172981, 1), (173089, 1), (173480, 1), (173612, 1), (173877, 1), (174547, 1), (174947, 1), (175052, 1), (175275, 1), (175663, 1), (175706, 1), (176212, 1), (176321, 1), (177585, 1), (177996, 1), (178788, 1), (179006, 1), (179054, 1), (179246, 1), (179739, 1), (180124, 1), (180583, 1), (180992, 1), (181097, 1), (181152, 1), (181304, 1), (182428, 1), (182539, 1), (182863, 1), (183094, 1), (183494, 1), (184058, 1), (185158, 1), (185227, 1), (185454, 1), (185722, 1), (185751, 1), (185883, 1), (186281, 1), (186468, 1), (186475, 1), (186555, 1), (187004, 1), (187049, 1), (187405, 1), (188200, 1), (188447, 1), (188978, 1), (189238, 1), (190289, 1), (190980, 1), (191137, 1), (191607, 1), (191881, 1), (192140, 1), (192198, 1), (192298, 1), (192394, 1), (193089, 1), (193225, 1), (193314, 1), (193486, 1), (193620, 1), (193631, 1), (194421, 1), (194476, 1), (194997, 1), (195231, 1), (195513, 1), (198756, 1), (198757, 1), (198932, 1), (199218, 1), (199327, 1), (199848, 1), (200032, 1), (200186, 1), (203569, 1), (203845, 1), (135384, 1), (135914, 1), (136354, 1), (138656, 1), (138991, 1), (141104, 1), (141704, 1), (142616, 1), (142784, 1), (144298, 1), (144480, 1), (144908, 1), (145861, 1), (146340, 1), (146508, 1), (146628, 1), (146976, 1), (147719, 1), (149311, 1), (149677, 1), (150425, 1), (150713, 1), (150957, 1), (151068, 1), (151663, 1), (151904, 1), (60894, 1), (62364, 1), (62504, 1), (63761, 1), (65023, 1), (65106, 1), (66311, 1), (66473, 1), (66484, 1), (67652, 1), (68158, 1), (69190, 1), (69862, 1), (70022, 1), (71134, 1), (71195, 1), (72121, 1), (72915, 1), (73174, 1)]
def printing_result(): cont = 0 maximum = 5 for i in final_result_list: if cont == maximum: break if i[0] != None: cont += 1 yield i[1], list(((k, v) for k, v in file_map.items() if k == i[0])) print(list(printing_result()))
[(37, [(157218, 'Peru 0-0 Colombia (Colombia win 4-2 on penalties): Copa América – as it happened')]), (35, [(21637, 'The Secret History of Colombia’s Paramilitaries and the U.S. War on Drugs - The New York Times')]), (18, [(22019, 'Colombian Opposition to Peace Deal Feeds Off Gay Rights Backlash - The New York Times')]), (18, [(80078, 'The End of Colombian Exceptionalism')]), (17, [(172524, 'Can Colombia Finally Fix Its Split Personality? ')])]
from collections import defaultdict def accumulate2(l): d = defaultdict(list) for k, *v in l: d[k].append(sum(v)) for k in d.keys(): yield k, len(d[k]) news_rdd = articles_rdd.flatMap(lambda line: [(line[0] , (word, 1)) for word in (line[1] + " " + line[2]).split(" ")]) \ .groupByKey() \ .map(lambda word: (word[0], list(word[1]))) \ .filter(lambda x: x[0] != None) \ .map(lambda lista: (lista[0], sorted(list(accumulate2(lista[1])), key = lambda x: -x[1]))) \ .cache() news_rdd.count()
Out[8]: 102038
from functools import reduce id_search = int(dbutils.widgets.get("search")) if not id_search in file_map or id_search == None: print("Not found") else: new_title = file_map[id_search] in_new_rdd = news_rdd.filter(lambda x, id_search=id_search: x[0] == id_search) in_new_list = in_new_rdd.collect() def news_similarity2(rdd_other_news): list1 = in_new_list[0][1] list2 = rdd_other_news[1] list3 = [] for value in list1: for v in list2: if value[0] == v[0]: list3.append(value[0]) union = list1 + list2 distance_list = list(filter((lambda x, list3=list3: x[0] in list3), union)) if len(distance_list) != 0: last_result = reduce(lambda a, b: (a[0], a[1] + b[1]) if a[0] != "" and b[0] != "" else 0, distance_list)[1] result = [rdd_other_news[0], last_result, len(list3)] else: result = [rdd_other_news[0], 0, 0] return result other_news = news_rdd.filter(lambda x, id_search=id_search: x[0] != id_search) sim_news = other_news.map(news_similarity2) \ .sortBy(lambda x: -x[1]) sim_news_df = sim_news.toDF(["id","similarity","words"]) display(sim_news_df)
Showing the first 222 rows.
Last refresh: Never