Vc 909 KB

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
786417865178661786717868178691787017871178721787317874178751787617877178781787917880178811788217883178841788517886178871788817889178901789117892178931789417895178961789717898178991790017901179021790317904179051790617907179081790917910179111791217913179141791517916179171791817919179201792117922179231792417925179261792717928179291793017931179321793317934179351793617937179381793917940179411794217943179441794517946179471794817949179501795117952179531795417955179561795717958179591796017961179621796317964179651796617967179681796917970179711797217973179741797517976179771797817979179801798117982179831798417985179861798717988179891799017991179921799317994179951799617997179981799918000180011800218003180041800518006180071800818009180101801118012180131801418015180161801718018180191802018021180221802318024180251802618027180281802918030180311803218033180341803518036180371803818039180401804118042180431804418045180461804718048180491805018051180521805318054180551805618057180581805918060180611806218063180641806518066180671806818069180701807118072180731807418075180761807718078180791808018081180821808318084180851808618087180881808918090180911809218093180941809518096180971809818099181001810118102181031810418105181061810718108181091811018111181121811318114181151811618117181181811918120181211812218123181241812518126181271812818129181301813118132181331813418135181361813718138181391814018141181421814318144181451814618147181481814918150181511815218153181541815518156181571815818159181601816118162181631816418165181661816718168181691817018171181721817318174181751817618177181781817918180181811818218183181841818518186181871818818189181901819118192181931819418195181961819718198181991820018201182021820318204182051820618207182081820918210182111821218213182141821518216182171821818219182201822118222182231822418225182261822718228182291823018231182321823318234182351823618237182381823918240182411824218243182441824518246182471824818249182501825118252182531825418255182561825718258182591826018261182621826318264182651826618267182681826918270182711827218273182741827518276182771827818279182801828118282182831828418285182861828718288182891829018291182921829318294182951829618297182981829918300183011830218303183041830518306183071830818309183101831118312183131831418315183161831718318183191832018321183221832318324183251832618327183281832918330183311833218333183341833518336183371833818339183401834118342183431834418345183461834718348183491835018351183521835318354183551835618357183581835918360183611836218363183641836518366183671836818369183701837118372183731837418375183761837718378183791838018381183821838318384183851838618387183881838918390183911839218393183941839518396183971839818399184001840118402184031840418405184061840718408184091841018411184121841318414184151841618417184181841918420184211842218423184241842518426184271842818429184301843118432184331843418435184361843718438184391844018441184421844318444184451844618447184481844918450184511845218453184541845518456184571845818459184601846118462184631846418465184661846718468184691847018471184721847318474184751847618477184781847918480184811848218483184841848518486184871848818489184901849118492184931849418495184961849718498184991850018501185021850318504185051850618507185081850918510185111851218513185141851518516185171851818519185201852118522185231852418525185261852718528185291853018531185321853318534185351853618537185381853918540185411854218543185441854518546185471854818549185501855118552185531855418555185561855718558185591856018561185621856318564185651856618567185681856918570185711857218573185741
857518576185771857818579185801858118582185831858418585185861858718588185891859018591185921859318594185951859618597185981859918600186011860218603186041860518606186071860818609186101861118612186131861418615186161861718618186191862018621186221862318624186251862618627186281862918630186311863218633186341863518636186371863818639186401864118642186431864418645186461864718648186491865018651186521865318654186551865618657186581865918660186611866218663186641866518666186671866818669186701867118672186731867418675186761867718678186791868018681186821868318684186851868618687186881868918690186911869218693186941869518696186971869818699187001870118702187031870418705187061870718708187091871018711187121871318714187151871618717187181871918720187211872218723187241872518726187271872818729187301873118732187331873418735187361873718738187391874018741187421874318744187451874618747187481874918750187511875218753187541875518756187571875818759187601876118762187631876418765187661876718768187691877018771187721877318774187751877618777187781877918780187811878218783187841878518786187871878818789187901879118792187931879418795187961879718798187991880018801188021880318804188051880618807188081880918810188111881218813188141881518816188171881818819188201882118822188231882418825188261882718828188291883018831188321883318834188351883618837188381883918840188411884218843188441884518846188471884818849188501885118852188531885418855188561885718858188591886018861188621886318864188651886618867188681886918870188711887218873188741887518876188771887818879188801888118882188831888418885188861888718888188891889018891188921889318894188951889618897188981889918900189011890218903189041890518906189071890818909189101891118912189131891418915189161891718918189191892018921189221892318924189251892618927189281892918930189311893218933189341893518936189371893818939189401894118942189431894418945189461894718948189491895018951189521895318954189551895618957189581895918960189611896218963189641896518966189671896818969189701897118972189731897418975189761897718978189791898018981189821898318984189851898618987189881898918990189911899218993189941899518996189971899818999190001900119002190031900419005190061900719008190091901019011190121901319014190151901619017190181901919020190211902219023190241902519026190271902819029190301903119032190331903419035190361903719038190391904019041190421904319044190451904619047190481904919050190511905219053190541905519056190571905819059190601906119062190631906419065190661906719068190691907019071190721907319074190751907619077190781907919080190811908219083190841908519086190871908819089190901909119092190931909419095190961909719098190991910019101191021910319104191051910619107191081910919110191111911219113191141911519116191171911819119191201912119122191231912419125191261912719128191291913019131191321913319134191351913619137191381913919140191411914219143191441914519146191471914819149191501915119152191531915419155191561915719158191591916019161191621916319164191651916619167191681916919170191711917219173191741917519176191771917819179191801918119182191831918419185191861918719188191891919019191191921919319194191951919619197191981919919200192011920219203192041920519206192071920819209192101921119212192131921419215192161921719218192191922019221192221922319224192251922619227192281922919230192311923219233192341923519236192371923819239192401924119242192431924419245192461924719248192491925019251192521925319254192551925619257192581925919260192611926219263192641926519266192671926819269192701927119272192731927419275192761927719278192791928019281192821928319284192851
928619287192881928919290192911929219293192941929519296192971929819299193001930119302193031930419305193061930719308193091931019311193121931319314193151931619317193181931919320193211932219323193241932519326193271932819329193301933119332193331933419335193361933719338193391934019341193421934319344193451934619347193481934919350193511935219353193541935519356193571935819359193601936119362193631936419365193661936719368193691937019371193721937319374193751937619377193781937919380193811938219383193841938519386193871938819389193901939119392193931939419395193961939719398193991940019401194021940319404194051940619407194081940919410194111941219413194141941519416194171941819419194201942119422194231942419425194261942719428194291943019431194321943319434194351943619437194381943919440194411944219443194441944519446194471944819449194501945119452194531945419455194561945719458194591946019461194621946319464194651946619467194681946919470194711947219473194741947519476194771947819479194801948119482194831948419485194861948719488194891949019491194921949319494194951949619497194981949919500195011950219503195041950519506195071950819509195101951119512195131951419515195161951719518195191952019521195221952319524195251952619527195281952919530195311953219533195341953519536195371953819539195401954119542195431954419545195461954719548195491955019551195521955319554195551955619557195581955919560195611956219563195641956519566195671956819569195701957119572195731957419575195761957719578195791958019581195821958319584195851958619587195881958919590195911959219593195941959519596195971959819599196001960119602196031960419605196061960719608196091961019611196121961319614196151961619617196181961919620196211962219623196241962519626196271962819629196301963119632196331963419635196361963719638196391964019641196421964319644196451964619647196481964919650196511965219653196541965519656196571965819659196601966119662196631966419665196661966719668196691967019671196721967319674196751967619677196781967919680196811968219683196841968519686196871968819689196901969119692196931969419695196961969719698196991970019701197021970319704197051970619707197081970919710197111971219713197141971519716197171971819719197201972119722197231972419725197261972719728197291973019731197321973319734197351973619737197381973919740197411974219743197441974519746197471974819749197501975119752197531975419755197561975719758197591976019761197621976319764197651976619767197681976919770197711977219773197741977519776197771977819779197801978119782197831978419785197861978719788197891979019791197921979319794197951979619797197981979919800198011980219803198041980519806198071980819809198101981119812198131981419815198161981719818198191982019821198221982319824198251982619827198281982919830198311983219833198341983519836198371983819839198401984119842198431984419845198461984719848198491985019851198521985319854198551985619857198581985919860198611986219863198641986519866198671986819869198701987119872198731987419875198761987719878198791988019881198821988319884198851988619887198881988919890198911989219893198941989519896198971989819899199001990119902199031990419905199061990719908199091991019911199121991319914199151991619917199181991919920199211992219923199241992519926199271992819929199301993119932199331993419935199361993719938199391994019941199421994319944199451994619947199481994919950199511995219953199541995519956199571995819959199601996119962199631996419965199661996719968199691997019971199721997319974199751997619977199781997919980199811998219983199841998519986199871998819989199901999119992199931999419995199961
999719998199992000020001200022000320004200052000620007200082000920010200112001220013200142001520016200172001820019200202002120022200232002420025200262002720028200292003020031200322003320034200352003620037200382003920040200412004220043200442004520046200472004820049200502005120052200532005420055200562005720058200592006020061200622006320064200652006620067200682006920070200712007220073200742007520076200772007820079200802008120082200832008420085200862008720088200892009020091200922009320094200952009620097200982009920100201012010220103201042010520106201072010820109201102011120112201132011420115201162011720118201192012020121201222012320124201252012620127201282012920130201312013220133201342013520136201372013820139201402014120142201432014420145201462014720148201492015020151201522015320154201552015620157201582015920160201612016220163201642016520166201672016820169201702017120172201732017420175201762017720178201792018020181201822018320184201852018620187201882018920190201912019220193201942019520196201972019820199202002020120202202032020420205202062020720208202092021020211202122021320214202152021620217202182021920220202212022220223202242022520226202272022820229202302023120232202332023420235202362023720238202392024020241202422024320244202452024620247202482024920250202512025220253202542025520256202572025820259202602026120262202632026420265202662026720268202692027020271202722027320274202752027620277202782027920280202812028220283202842028520286202872028820289202902029120292202932029420295202962029720298202992030020301203022030320304203052030620307203082030920310203112031220313203142031520316203172031820319203202032120322203232032420325203262032720328203292033020331203322033320334203352033620337203382033920340203412034220343203442034520346203472034820349203502035120352203532035420355203562035720358203592036020361203622036320364203652036620367203682036920370203712037220373203742037520376203772037820379203802038120382203832038420385203862038720388203892039020391203922039320394203952039620397203982039920400204012040220403204042040520406204072040820409204102041120412204132041420415204162041720418204192042020421204222042320424204252042620427204282042920430204312043220433204342043520436204372043820439204402044120442204432044420445204462044720448204492045020451204522045320454204552045620457204582045920460204612046220463204642046520466204672046820469204702047120472204732047420475204762047720478204792048020481204822048320484204852048620487204882048920490204912049220493204942049520496204972049820499205002050120502205032050420505205062050720508205092051020511205122051320514205152051620517205182051920520205212052220523205242052520526205272052820529205302053120532205332053420535205362053720538205392054020541205422054320544205452054620547205482054920550205512055220553205542055520556205572055820559205602056120562205632056420565205662056720568205692057020571205722057320574205752057620577205782057920580205812058220583205842058520586205872058820589205902059120592205932059420595205962059720598205992060020601206022060320604206052060620607206082060920610206112061220613206142061520616206172061820619206202062120622206232062420625206262062720628206292063020631206322063320634206352063620637206382063920640206412064220643206442064520646206472064820649206502065120652206532065420655206562065720658206592066020661206622066320664206652066620667206682066920670206712067220673206742067520676206772067820679206802068120682206832068420685206862068720688206892069020691206922069320694206952069620697206982069920700207012070220703207042070520706207072
070820709207102071120712207132071420715207162071720718207192072020721207222072320724207252072620727207282072920730207312073220733207342073520736207372073820739207402074120742207432074420745207462074720748207492075020751207522075320754207552075620757207582075920760207612076220763207642076520766207672076820769207702077120772207732077420775207762077720778207792078020781207822078320784207852078620787207882078920790207912079220793207942079520796207972079820799208002080120802208032080420805208062080720808208092081020811208122081320814208152081620817208182081920820208212082220823208242082520826208272082820829208302083120832208332083420835208362083720838208392084020841208422084320844208452084620847208482084920850208512085220853208542085520856208572085820859208602086120862208632086420865208662086720868208692087020871208722087320874208752087620877208782087920880208812088220883208842088520886208872088820889208902089120892208932089420895208962089720898208992090020901209022090320904209052090620907209082090920910209112091220913209142091520916209172091820919209202092120922209232092420925209262092720928209292093020931209322093320934209352093620937209382093920940209412094220943209442094520946209472094820949209502095120952209532095420955209562095720958209592096020961209622096320964209652096620967209682096920970209712097220973209742097520976209772097820979209802098120982209832098420985209862098720988209892099020991209922099320994209952099620997209982099921000210012100221003210042100521006210072100821009210102101121012210132101421015210162101721018210192102021021210222102321024210252102621027210282102921030210312103221033210342103521036210372103821039210402104121042210432104421045210462104721048210492105021051210522105321054210552105621057210582105921060210612106221063210642106521066210672106821069210702107121072210732107421075210762107721078210792108021081210822108321084210852108621087210882108921090210912109221093210942109521096210972109821099211002110121102211032110421105211062110721108211092111021111211122111321114211152111621117211182111921120211212112221123211242112521126211272112821129211302113121132211332113421135211362113721138211392114021141211422114321144211452114621147211482114921150211512115221153211542115521156211572115821159211602116121162211632116421165211662116721168211692117021171211722117321174211752117621177211782117921180211812118221183211842118521186211872118821189211902119121192211932119421195211962119721198211992120021201212022120321204212052120621207212082120921210212112121221213212142121521216212172121821219212202122121222212232122421225212262122721228212292123021231212322123321234212352123621237212382123921240212412124221243212442124521246212472124821249212502125121252212532125421255212562125721258212592126021261212622126321264212652126621267212682126921270212712127221273212742127521276212772127821279212802128121282212832128421285212862128721288212892129021291212922129321294212952129621297212982129921300213012130221303213042130521306213072130821309213102131121312213132131421315213162131721318213192132021321213222132321324213252132621327213282132921330213312133221333213342133521336213372133821339213402134121342213432134421345213462134721348213492135021351213522135321354213552135621357213582135921360213612136221363213642136521366213672136821369213702137121372213732137421375213762137721378213792138021381213822138321384213852138621387213882138921390213912139221393213942139521396213972139821399214002140121402214032140421405214062140721408214092141021411214122141321414214152141621417214182
141921420214212142221423214242142521426214272142821429214302143121432214332143421435214362143721438214392144021441214422144321444214452144621447214482144921450214512145221453214542145521456214572145821459214602146121462214632146421465214662146721468214692147021471214722147321474214752147621477214782147921480214812148221483214842148521486214872148821489214902149121492214932149421495214962149721498214992150021501215022150321504215052150621507215082150921510215112151221513215142151521516215172151821519215202152121522215232152421525215262152721528215292153021531215322153321534215352153621537215382153921540215412154221543215442154521546215472154821549215502155121552215532155421555215562155721558215592156021561215622156321564215652156621567215682156921570215712157221573215742157521576215772157821579215802158121582215832158421585215862158721588215892159021591215922159321594215952159621597215982159921600216012160221603216042160521606216072160821609216102161121612216132161421615216162161721618216192162021621216222162321624216252162621627216282162921630216312163221633216342163521636216372163821639216402164121642216432164421645216462164721648216492165021651216522165321654216552165621657216582165921660216612166221663216642166521666216672166821669216702167121672216732167421675216762167721678216792168021681216822168321684216852168621687216882168921690216912169221693216942169521696216972169821699217002170121702217032170421705217062170721708217092171021711217122171321714217152171621717217182171921720217212172221723217242172521726217272172821729217302173121732217332173421735217362173721738217392174021741217422174321744217452174621747217482174921750217512175221753217542175521756217572175821759217602176121762217632176421765217662176721768217692177021771217722177321774217752177621777217782177921780217812178221783217842178521786217872178821789217902179121792217932179421795217962179721798217992180021801218022180321804218052180621807218082180921810218112181221813218142181521816218172181821819218202182121822218232182421825218262182721828218292183021831218322183321834218352183621837218382183921840218412184221843218442184521846218472184821849218502185121852218532185421855218562185721858218592186021861218622186321864218652186621867218682186921870218712187221873218742187521876218772187821879218802188121882218832188421885218862188721888218892189021891218922189321894218952189621897218982189921900219012190221903219042190521906219072190821909219102191121912219132191421915219162191721918219192192021921219222192321924219252192621927219282192921930219312193221933219342193521936219372193821939219402194121942219432194421945219462194721948219492195021951219522195321954219552195621957219582195921960219612196221963219642196521966219672196821969219702197121972219732197421975219762197721978219792198021981219822198321984219852198621987219882198921990219912199221993219942199521996219972199821999220002200122002220032200422005220062200722008220092201022011220122201322014220152201622017220182201922020220212202222023220242202522026220272202822029220302203122032220332203422035220362203722038220392204022041220422204322044220452204622047220482204922050220512205222053220542205522056220572205822059220602206122062220632206422065220662206722068220692207022071220722207322074220752207622077220782207922080220812208222083220842208522086220872208822089220902209122092220932209422095220962209722098220992210022101221022210322104221052210622107221082210922110221112211222113221142211522116221172211822119221202212122122221232212422125221262212722128221292
213022131221322213322134221352213622137221382213922140221412214222143221442214522146221472214822149221502215122152221532215422155221562215722158221592216022161221622216322164221652216622167221682216922170221712217222173221742217522176221772217822179221802218122182221832218422185221862218722188221892219022191221922219322194221952219622197221982219922200222012220222203222042220522206222072220822209222102221122212222132221422215222162221722218222192222022221222222222322224222252222622227222282222922230222312223222233222342223522236222372223822239222402224122242222432224422245222462224722248222492225022251222522225322254222552225622257222582225922260222612226222263222642226522266222672226822269222702227122272222732227422275222762227722278222792228022281222822228322284222852228622287222882228922290222912229222293222942229522296222972229822299223002230122302223032230422305223062230722308223092231022311223122231322314223152231622317223182231922320223212232222323223242232522326223272232822329223302233122332223332233422335223362233722338223392234022341223422234322344223452234622347223482234922350223512235222353223542235522356223572235822359223602236122362223632236422365223662236722368223692237022371223722237322374223752237622377223782237922380223812238222383223842238522386223872238822389223902239122392223932239422395223962239722398223992240022401224022240322404224052240622407224082240922410224112241222413224142241522416224172241822419224202242122422224232242422425224262242722428224292243022431224322243322434224352243622437224382243922440224412244222443224442244522446224472244822449224502245122452224532245422455224562245722458224592246022461224622246322464224652246622467224682246922470224712247222473224742247522476224772247822479224802248122482224832248422485224862248722488224892249022491224922249322494224952249622497224982249922500225012250222503225042250522506225072250822509225102251122512225132251422515225162251722518225192252022521225222252322524225252252622527225282252922530225312253222533225342253522536225372253822539225402254122542225432254422545225462254722548225492255022551225522255322554225552255622557225582255922560225612256222563225642256522566225672256822569225702257122572225732257422575225762257722578225792258022581225822258322584225852258622587225882258922590225912259222593225942259522596225972259822599226002260122602226032260422605226062260722608226092261022611226122261322614226152261622617226182261922620226212262222623226242262522626226272262822629226302263122632226332263422635226362263722638226392264022641226422264322644226452264622647226482264922650226512265222653226542265522656226572265822659226602266122662226632266422665226662266722668226692267022671226722267322674226752267622677226782267922680226812268222683226842268522686226872268822689226902269122692226932269422695226962269722698226992270022701227022270322704227052270622707227082270922710227112271222713227142271522716227172271822719227202272122722227232272422725227262272722728227292273022731227322273322734227352273622737227382273922740227412274222743227442274522746227472274822749227502275122752227532275422755227562275722758227592276022761227622276322764227652276622767227682276922770227712277222773227742277522776227772277822779227802278122782227832278422785227862278722788227892279022791227922279322794227952279622797227982279922800228012280222803228042280522806228072280822809228102281122812228132281422815228162281722818228192282022821228222282322824228252282622827228282282922830228312283222833228342283522836228372283822839228402
284122842228432284422845228462284722848228492285022851228522285322854228552285622857228582285922860228612286222863228642286522866228672286822869228702287122872228732287422875228762287722878228792288022881228822288322884228852288622887228882288922890228912289222893228942289522896228972289822899229002290122902229032290422905229062290722908229092291022911229122291322914229152291622917229182291922920229212292222923229242292522926229272292822929229302293122932229332293422935229362293722938229392294022941229422294322944229452294622947229482294922950229512295222953229542295522956229572295822959229602296122962229632296422965229662296722968229692297022971229722297322974229752297622977229782297922980229812298222983229842298522986229872298822989229902299122992229932299422995229962299722998229992300023001230022300323004230052300623007230082300923010230112301223013230142301523016230172301823019230202302123022230232302423025
#ifndef VC_VECTOR_H_
#define VC_VECTOR_H_
#ifndef VC_SCALAR_VECTOR_H_
#define VC_SCALAR_VECTOR_H_
#include <assert.h>
#include <algorithm>
#include <cmath>
#ifdef _MSC_VER
#include <float.h>
#endif
#ifndef VC_COMMON_TYPES_H_
#define VC_COMMON_TYPES_H_
#ifdef Vc_CHECK_ALIGNMENT
#include <cstdlib>
#include <cstdio>
#endif
#include <ratio>
#ifndef VC_GLOBAL_H_
#define VC_GLOBAL_H_
#include <cstdint>
#ifndef VC_FWDDECL_H_
#define VC_FWDDECL_H_
#include <cstddef>
#define Vc_VERSIONED_NAMESPACE Vc_1
namespace Vc_VERSIONED_NAMESPACE
{
namespace VectorAbi
{
struct Scalar {};
struct Sse {};
struct Avx {};
struct Mic {};
template <class T> struct DeduceCompatible;
template <class T> struct DeduceBest;
}
namespace Common
{
template <class T, std::size_t N> struct select_best_vector_type;
}
template <class T, class Abi> class Mask;
template <class T, class Abi> class Vector;
template <class T, std::size_t N,
class V = typename Common::select_best_vector_type<T, N>::type,
std::size_t Wt = V::Size>
class SimdArray;
template <class T, std::size_t N,
class V = typename Common::select_best_vector_type<T, N>::type,
std::size_t Wt = V::Size>
class SimdMaskArray;
namespace simd_abi
{
using scalar = VectorAbi::Scalar;
template <int N> struct fixed_size;
template <class T> using compatible = typename VectorAbi::DeduceCompatible<T>::type;
template <class T> using native = typename VectorAbi::DeduceBest<T>::type;
using __sse = VectorAbi::Sse;
using __avx = VectorAbi::Avx;
struct __avx512;
struct __neon;
}
template <class T, class Abi = simd_abi::compatible<T>> using simd = Vector<T, Abi>;
template <class T, class Abi = simd_abi::compatible<T>> using simd_mask = Mask<T, Abi>;
template <class T> using native_simd = simd<T, simd_abi::native<T>>;
template <class T> using native_simd_mask = simd_mask<T, simd_abi::native<T>>;
template <class T, int N> using fixed_size_simd = simd<T, simd_abi::fixed_size<N>>;
template <class T, int N>
using fixed_size_simd_mask = simd_mask<T, simd_abi::fixed_size<N>>;
}
#ifndef DOXYGEN
namespace Vc = Vc_VERSIONED_NAMESPACE;
#endif
#endif
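// The aliases above mirror the std::experimental::simd naming: Vc::simd<T> selects the
// ABI intended to stay binary-compatible across translation units, Vc::native_simd<T>
// the widest ABI the current target supports, and fixed_size_simd<T, N> a width that is
// independent of the target. A usage sketch (assuming the usual broadcast construction
// from a scalar, which Vector<T> provides):
//   Vc::native_simd<float> v = 1.f;    // one value per hardware SIMD lane
//   Vc::fixed_size_simd<float, 8> w;   // always exactly 8 lanes, on any target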
#ifdef DOXYGEN
#define Vc_ICC __INTEL_COMPILER_BUILD_DATE
#undef Vc_ICC
#define Vc_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
#undef Vc_CLANG
#define Vc_APPLECLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
#undef Vc_APPLECLANG
#define Vc_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__)
#define Vc_MSVC _MSC_FULL_VER
#undef Vc_MSVC
#else
#ifdef __INTEL_COMPILER
#define Vc_ICC __INTEL_COMPILER_BUILD_DATE
#elif defined(__clang__) && defined(__apple_build_version__)
#define Vc_APPLECLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
#elif defined(__clang__)
#define Vc_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
#elif defined(__GNUC__)
#define Vc_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__)
#elif defined(_MSC_VER)
#define Vc_MSVC _MSC_FULL_VER
#else
#define Vc_UNSUPPORTED_COMPILER 1
#endif
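// The compiler-version macros above pack major.minor.patch into a single integer as
// major * 0x10000 + minor * 0x100 + patch, so GCC 6.0.0 corresponds to 0x60000 and the
// check "Vc_GCC >= 0x60000" just below reads as "GCC 6 or newer"; clang 3.6.x is the
// half-open range [0x30600, 0x30700) used further down.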
#if defined Vc_GCC && Vc_GCC >= 0x60000
#define Vc_RESET_DIAGNOSTICS _Pragma("GCC diagnostic pop")
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
#else
#define Vc_RESET_DIAGNOSTICS
#endif
#if defined Vc_ICC
#pragma warning disable 2922
#endif
#if __cplusplus < 201103 && (!defined Vc_MSVC || _MSC_VER < 1900)
# error "Vc requires support for C++11."
#elif __cplusplus >= 201402L
#define Vc_CXX14 1
# if __cplusplus > 201700L
#define Vc_CXX17 1
# endif
#endif
#if defined(__GNUC__) && !defined(Vc_NO_INLINE_ASM)
#define Vc_GNU_ASM 1
#endif
#ifdef Vc_GCC
#define Vc_HAVE_MAX_ALIGN_T 1
#elif !defined(Vc_CLANG) && !defined(Vc_ICC)
#define Vc_HAVE_STD_MAX_ALIGN_T 1
#endif
#if defined(Vc_GCC) || defined(Vc_CLANG) || defined Vc_APPLECLANG
#define Vc_USE_BUILTIN_VECTOR_TYPES 1
#endif
#ifdef Vc_MSVC
#define Vc_CDECL __cdecl
#define Vc_VDECL __vectorcall
#else
#define Vc_CDECL
#define Vc_VDECL
#endif
#define Scalar 0x00100000
#define SSE 0x00200000
#define SSE2 0x00300000
#define SSE3 0x00400000
#define SSSE3 0x00500000
#define SSE4_1 0x00600000
#define SSE4_2 0x00700000
#define AVX 0x00800000
#define AVX2 0x00900000
#define XOP 0x00000001
#define FMA4 0x00000002
#define F16C 0x00000004
#define POPCNT 0x00000008
#define SSE4a 0x00000010
#define FMA 0x00000020
#define BMI2 0x00000040
#define IMPL_MASK 0xFFF00000
#define EXT_MASK 0x000FFFFF
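// These short-lived macros (Scalar, SSE, ..., BMI2) exist only so that a user-supplied
// Vc_IMPL can be spelled with one of these tokens: IMPL_MASK selects the base
// instruction set in the upper bits, EXT_MASK the optional extensions in the lower
// bits. All of them are #undef'ed again once Vc_IMPL has been evaluated below.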
#ifdef Vc_MSVC
# ifdef _M_IX86_FP
# if _M_IX86_FP >= 1
# ifndef __SSE__
#define __SSE__ 1
# endif
# endif
# if _M_IX86_FP >= 2
# ifndef __SSE2__
#define __SSE2__ 1
# endif
# endif
# elif defined(_M_AMD64)
# ifndef __SSE__
#define __SSE__ 1
# endif
# ifndef __SSE2__
#define __SSE2__ 1
# endif
# endif
#endif
#if defined Vc_ICC && !defined __POPCNT__
# if defined __SSE4_2__ || defined __SSE4A__
#define __POPCNT__ 1
# endif
#endif
#ifdef VC_IMPL
#error "You are using the old VC_IMPL macro. Since Vc 1.0 all Vc macros start with Vc_, i.e. a lower-case 'c'"
#endif
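// The block below selects the implementation: if Vc_IMPL is not defined, the base
// instruction set and extensions are auto-detected from the compiler's predefined
// macros (__SSE2__, __AVX__, __FMA__, ...); otherwise the user-supplied Vc_IMPL value
// is decoded against IMPL_MASK / EXT_MASK and translated into Vc_IMPL_* macros.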
#ifndef Vc_IMPL
# if defined(__AVX2__)
#define Vc_IMPL_AVX2 1
#define Vc_IMPL_AVX 1
# elif defined(__AVX__)
#define Vc_IMPL_AVX 1
# else
# if defined(__SSE4_2__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSE4_2 1
# endif
# if defined(__SSE4_1__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSE4_1 1
# endif
# if defined(__SSE3__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSE3 1
# endif
# if defined(__SSSE3__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSSE3 1
# endif
# if defined(__SSE2__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSE2 1
# endif
# if defined(Vc_IMPL_SSE)
# else
#define Vc_IMPL_Scalar 1
# endif
# endif
# if !defined(Vc_IMPL_Scalar)
# ifdef __FMA4__
#define Vc_IMPL_FMA4 1
# endif
# ifdef __XOP__
#define Vc_IMPL_XOP 1
# endif
# ifdef __F16C__
#define Vc_IMPL_F16C 1
# endif
# ifdef __POPCNT__
#define Vc_IMPL_POPCNT 1
# endif
# ifdef __SSE4A__
#define Vc_IMPL_SSE4a 1
# endif
# ifdef __FMA__
#define Vc_IMPL_FMA 1
# endif
# ifdef __BMI2__
#define Vc_IMPL_BMI2 1
# endif
# endif
#else
# if (Vc_IMPL & IMPL_MASK) == AVX2
#define Vc_IMPL_AVX2 1
#define Vc_IMPL_AVX 1
# elif (Vc_IMPL & IMPL_MASK) == AVX
#define Vc_IMPL_AVX 1
# elif (Vc_IMPL & IMPL_MASK) == Scalar
#define Vc_IMPL_Scalar 1
# elif (Vc_IMPL & IMPL_MASK) == SSE4_2
#define Vc_IMPL_SSE4_2 1
#define Vc_IMPL_SSE4_1 1
#define Vc_IMPL_SSSE3 1
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE4_1
#define Vc_IMPL_SSE4_1 1
#define Vc_IMPL_SSSE3 1
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSSE3
#define Vc_IMPL_SSSE3 1
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE3
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE2
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE
#define Vc_IMPL_SSE 1
# if defined(__SSE4_2__)
#define Vc_IMPL_SSE4_2 1
# endif
# if defined(__SSE4_1__)
#define Vc_IMPL_SSE4_1 1
# endif
# if defined(__SSE3__)
#define Vc_IMPL_SSE3 1
# endif
# if defined(__SSSE3__)
#define Vc_IMPL_SSSE3 1
# endif
# if defined(__SSE2__)
#define Vc_IMPL_SSE2 1
# endif
# elif (Vc_IMPL & IMPL_MASK) == 0 && (Vc_IMPL & SSE4a)
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# endif
# if (Vc_IMPL & XOP)
#define Vc_IMPL_XOP 1
# endif
# if (Vc_IMPL & FMA4)
#define Vc_IMPL_FMA4 1
# endif
# if (Vc_IMPL & F16C)
#define Vc_IMPL_F16C 1
# endif
# if (!defined(Vc_IMPL_Scalar) && defined(__POPCNT__)) || (Vc_IMPL & POPCNT)
#define Vc_IMPL_POPCNT 1
# endif
# if (Vc_IMPL & SSE4a)
#define Vc_IMPL_SSE4a 1
# endif
# if (Vc_IMPL & FMA)
#define Vc_IMPL_FMA 1
# endif
# if (Vc_IMPL & BMI2)
#define Vc_IMPL_BMI2 1
# endif
#undef Vc_IMPL
#endif
  313. #ifdef __AVX__
  314. #define Vc_USE_VEX_CODING 1
  315. #endif
  316. #ifdef Vc_IMPL_AVX
  317. #define Vc_IMPL_SSE4_2 1
  318. #define Vc_IMPL_SSE4_1 1
  319. #define Vc_IMPL_SSSE3 1
  320. #define Vc_IMPL_SSE3 1
  321. #define Vc_IMPL_SSE2 1
  322. #define Vc_IMPL_SSE 1
  323. #endif
  324. #if defined(Vc_CLANG) && Vc_CLANG >= 0x30600 && Vc_CLANG < 0x30700
  325. # if defined(Vc_IMPL_AVX)
  326. # warning "clang 3.6.x miscompiles AVX code, frequently losing 50% of the data. Vc will fall back to SSE4 instead."
  327. #undef Vc_IMPL_AVX
  328. # if defined(Vc_IMPL_AVX2)
  329. #undef Vc_IMPL_AVX2
  330. # endif
  331. # endif
  332. #endif
  333. # if !defined(Vc_IMPL_Scalar) && !defined(Vc_IMPL_SSE) && !defined(Vc_IMPL_AVX)
  334. # error "No suitable Vc implementation was selected! Probably Vc_IMPL was set to an invalid value."
  335. # elif defined(Vc_IMPL_SSE) && !defined(Vc_IMPL_SSE2)
  336. # error "SSE requested but no SSE2 support. Vc needs at least SSE2!"
  337. # endif
  338. #undef Scalar
  339. #undef SSE
  340. #undef SSE2
  341. #undef SSE3
  342. #undef SSSE3
  343. #undef SSE4_1
  344. #undef SSE4_2
  345. #undef AVX
  346. #undef AVX2
  347. #undef XOP
  348. #undef FMA4
  349. #undef F16C
  350. #undef POPCNT
  351. #undef SSE4a
  352. #undef FMA
  353. #undef BMI2
  354. #undef IMPL_MASK
  355. #undef EXT_MASK
  356. #if defined Vc_IMPL_AVX2
  357. #define Vc_DEFAULT_IMPL_AVX2
  358. #elif defined Vc_IMPL_AVX
  359. #define Vc_DEFAULT_IMPL_AVX
  360. #elif defined Vc_IMPL_SSE
  361. #define Vc_DEFAULT_IMPL_SSE
  362. #elif defined Vc_IMPL_Scalar
  363. #define Vc_DEFAULT_IMPL_Scalar
  364. #else
  365. #error "Preprocessor logic broken. Please report a bug."
  366. #endif
  367. #endif
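// Editor's note (not part of the upstream header): the block above picks the
// target ISA from the compiler's predefined macros (__AVX2__, __SSE2__, ...) or,
// when Vc_IMPL is defined by the build, from that value. A hedged sketch of a
// translation-unit sanity check one might add after including Vc, using only
// the Vc_IMPL_* macros defined above:
#if 0
#if !defined(Vc_IMPL_SSE2) && !defined(Vc_IMPL_AVX) && !defined(Vc_IMPL_Scalar)
#error "expected at least SSE2, AVX or the scalar fallback to be selected"
#endif
#endif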
  368. namespace Vc_VERSIONED_NAMESPACE
  369. {
  370. typedef signed char int8_t;
  371. typedef unsigned char uint8_t;
  372. typedef signed short int16_t;
  373. typedef unsigned short uint16_t;
  374. typedef signed int int32_t;
  375. typedef unsigned int uint32_t;
  376. typedef signed long long int64_t;
  377. typedef unsigned long long uint64_t;
  378. enum MallocAlignment {
  379. AlignOnVector,
  380. AlignOnCacheline,
  381. AlignOnPage
  382. };
  383. enum Implementation : std::uint_least32_t {
  384. ScalarImpl,
  385. SSE2Impl,
  386. SSE3Impl,
  387. SSSE3Impl,
  388. SSE41Impl,
  389. SSE42Impl,
  390. AVXImpl,
  391. AVX2Impl,
  392. MICImpl,
  393. ImplementationMask = 0xfff
  394. };
  395. enum ExtraInstructions : std::uint_least32_t {
  396. Float16cInstructions = 0x01000,
  397. Fma4Instructions = 0x02000,
  398. XopInstructions = 0x04000,
  399. PopcntInstructions = 0x08000,
  400. Sse4aInstructions = 0x10000,
  401. FmaInstructions = 0x20000,
  402. VexInstructions = 0x40000,
  403. Bmi2Instructions = 0x80000,
  404. ExtraInstructionsMask = 0xfffff000u
  405. };
  406. template <unsigned int Features> struct ImplementationT {
  407. static constexpr Implementation current()
  408. {
  409. return static_cast<Implementation>(Features & ImplementationMask);
  410. }
  411. static constexpr bool is(Implementation impl)
  412. {
  413. return static_cast<unsigned int>(impl) == current();
  414. }
  415. static constexpr bool is_between(Implementation low, Implementation high)
  416. {
  417. return static_cast<unsigned int>(low) <= current() &&
  418. static_cast<unsigned int>(high) >= current();
  419. }
  420. static constexpr bool runs_on(unsigned int extraInstructions)
  421. {
  422. return (extraInstructions & Features & ExtraInstructionsMask) ==
  423. (Features & ExtraInstructionsMask);
  424. }
  425. };
  426. using CurrentImplementation = ImplementationT<
  427. #ifdef Vc_IMPL_Scalar
  428. ScalarImpl
  429. #elif defined(Vc_IMPL_AVX2)
  430. AVX2Impl
  431. #elif defined(Vc_IMPL_AVX)
  432. AVXImpl
  433. #elif defined(Vc_IMPL_SSE4_2)
  434. SSE42Impl
  435. #elif defined(Vc_IMPL_SSE4_1)
  436. SSE41Impl
  437. #elif defined(Vc_IMPL_SSSE3)
  438. SSSE3Impl
  439. #elif defined(Vc_IMPL_SSE3)
  440. SSE3Impl
  441. #elif defined(Vc_IMPL_SSE2)
  442. SSE2Impl
  443. #endif
  444. #ifdef Vc_IMPL_SSE4a
  445. + Vc::Sse4aInstructions
  446. #ifdef Vc_IMPL_XOP
  447. + Vc::XopInstructions
  448. #ifdef Vc_IMPL_FMA4
  449. + Vc::Fma4Instructions
  450. #endif
  451. #endif
  452. #endif
  453. #ifdef Vc_IMPL_POPCNT
  454. + Vc::PopcntInstructions
  455. #endif
  456. #ifdef Vc_IMPL_FMA
  457. + Vc::FmaInstructions
  458. #endif
  459. #ifdef Vc_IMPL_BMI2
  460. + Vc::Bmi2Instructions
  461. #endif
  462. #ifdef Vc_USE_VEX_CODING
  463. + Vc::VexInstructions
  464. #endif
  465. >;
  466. }
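// Editor's sketch (not part of upstream Vc): CurrentImplementation bakes the
// selected ISA plus the extra-instruction bits into one compile-time constant,
// so user code can query it with static_assert. The <Vc/Vc> include is the
// assumed user-facing entry point.
#if 0
#include <Vc/Vc>
static_assert(Vc::CurrentImplementation::is(Vc::CurrentImplementation::current()),
              "trivially true: current() reports the ISA Vc was compiled for");
static_assert(Vc::CurrentImplementation::is_between(Vc::ScalarImpl, Vc::AVX2Impl),
              "this header path only ever selects Scalar up to AVX2");
#endif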
  467. #ifndef VC_VERSION_H_
  468. #define VC_VERSION_H_
  469. #define Vc_VERSION_STRING "1.4.1-dev"
  470. #define Vc_VERSION_NUMBER 0x010403
  471. #define Vc_VERSION_CHECK(major,minor,patch) ((major << 16) | (minor << 8) | (patch << 1))
  472. #define Vc_LIBRARY_ABI_VERSION 5
  473. #define Vc_IS_VERSION_2 (Vc_VERSION_NUMBER >= Vc_VERSION_CHECK(1, 70, 0))
  474. #define Vc_IS_VERSION_1 (Vc_VERSION_NUMBER < Vc_VERSION_CHECK(1, 70, 0))
  475. namespace Vc_VERSIONED_NAMESPACE
  476. {
  477. inline const char *versionString() { return Vc_VERSION_STRING; }
  478. constexpr unsigned int versionNumber() { return Vc_VERSION_NUMBER; }
  479. }
  480. #endif
  481. #endif
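// Editor's sketch (not part of upstream Vc): Vc_VERSION_CHECK packs
// (major, minor, patch) into the same layout as Vc_VERSION_NUMBER (the low bit
// apparently marks -dev builds), so version gates are plain integer compares:
#if 0
#if Vc_VERSION_NUMBER >= Vc_VERSION_CHECK(1, 4, 0)
// code that relies on Vc 1.4 features
#endif
// runtime equivalents: Vc::versionString() -> "1.4.1-dev",
//                      Vc::versionNumber() -> 0x010403
#endif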
  482. #ifndef VC_TRAITS_TYPE_TRAITS_H_
  483. #define VC_TRAITS_TYPE_TRAITS_H_
  484. #include <type_traits>
  485. #ifndef VC_TRAITS_DECAY_H_
  486. #define VC_TRAITS_DECAY_H_
  487. namespace Vc_VERSIONED_NAMESPACE
  488. {
  489. namespace Traits
  490. {
  491. template <typename T> using decay = typename std::decay<T>::type;
  492. }
  493. }
  494. #endif
  495. #ifndef VC_TRAITS_HAS_NO_ALLOCATED_DATA_H_
  496. #define VC_TRAITS_HAS_NO_ALLOCATED_DATA_H_
  497. #include <array>
  498. namespace Vc_VERSIONED_NAMESPACE
  499. {
  500. namespace Traits
  501. {
  502. template<typename T> struct has_no_allocated_data_impl : public std::false_type {};
  503. template <typename T>
  504. struct has_no_allocated_data
  505. : public has_no_allocated_data_impl<
  506. typename std::remove_cv<typename std::remove_reference<T>::type>::type>
  507. {
  508. };
  509. template<typename T, std::size_t N> struct has_no_allocated_data_impl<std::array<T, N>> : public std::true_type {};
  510. template<typename T, std::size_t N> struct has_no_allocated_data_impl<T[N]> : public std::true_type {};
  511. template<typename T> struct has_no_allocated_data_impl<T[]> : public std::true_type {};
  512. }
  513. }
  514. #endif
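// Editor's sketch (not part of upstream Vc): has_no_allocated_data is true for
// types whose elements live inside the object itself (std::array, C arrays);
// the primary template leaves everything else, e.g. std::vector, at false.
// Assumes <vector> is included wherever this is tested.
#if 0
static_assert(Vc::Traits::has_no_allocated_data<std::array<float, 4>>::value, "");
static_assert(Vc::Traits::has_no_allocated_data<int[8]>::value, "");
static_assert(!Vc::Traits::has_no_allocated_data<std::vector<int>>::value, "");
#endif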
  515. #ifndef VC_TRAITS_HAS_CONTIGUOUS_STORAGE_H_
  516. #define VC_TRAITS_HAS_CONTIGUOUS_STORAGE_H_
  517. #include <initializer_list>
  518. #include <memory>
  519. #ifdef _LIBCPP_BEGIN_NAMESPACE_STD
  520. _LIBCPP_BEGIN_NAMESPACE_STD
  521. #else
  522. namespace std
  523. {
  524. #endif
  525. #ifdef _WIN32
  526. template <typename T, size_t N> class array;
  527. #else
  528. template <typename T, size_t N> struct array;
  529. #endif
  530. template <typename T, typename Allocator> class vector;
  531. #ifdef _LIBCPP_END_NAMESPACE_STD
  532. _LIBCPP_END_NAMESPACE_STD
  533. #else
  534. }
  535. #endif
  536. namespace Vc_VERSIONED_NAMESPACE
  537. {
  538. namespace Traits
  539. {
  540. namespace has_contiguous_storage_detail
  541. {
  542. template <typename T, typename It = typename T::iterator>
  543. std::is_base_of<std::random_access_iterator_tag,
  544. typename std::iterator_traits<It>::iterator_category>
  545. test(int);
  546. template <typename T>
  547. std::is_base_of<std::random_access_iterator_tag,
  548. typename std::iterator_traits<T>::iterator_category>
  549. test(long);
  550. template <typename T> std::false_type test(...);
  551. }
  552. template <typename T>
  553. struct has_contiguous_storage_impl
  554. : public decltype(has_contiguous_storage_detail::test<T>(int())) {
  555. };
  556. template <typename T>
  557. struct has_contiguous_storage
  558. : public has_contiguous_storage_impl<
  559. typename std::remove_cv<typename std::remove_reference<T>::type>::type>
  560. {
  561. };
  562. template <typename T> struct has_contiguous_storage_impl<const T *> : public std::true_type {};
  563. template <typename T> struct has_contiguous_storage_impl<T *> : public std::true_type {};
  564. template <typename T> struct has_contiguous_storage_impl<std::unique_ptr<T[]>> : public std::true_type {};
  565. template <typename T> struct has_contiguous_storage_impl<std::initializer_list<T>> : public std::true_type {};
  566. template <typename T, std::size_t N> struct has_contiguous_storage_impl<T[N]> : public std::true_type {};
  567. template <typename T, std::size_t N> struct has_contiguous_storage_impl<std::array<T, N>> : public std::true_type {};
  568. template <typename T, typename A> struct has_contiguous_storage_impl<std::vector<T, A>> : public std::true_type {};
  569. }
  570. }
  571. #endif
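// Editor's sketch (not part of upstream Vc): has_contiguous_storage combines the
// explicit specializations above (pointers, std::array, std::vector,
// std::initializer_list, ...) with a fallback that checks for a random-access
// iterator. Assumes <vector> and <list> are included wherever this is tested.
#if 0
static_assert(Vc::Traits::has_contiguous_storage<std::vector<int>>::value, "");
static_assert(Vc::Traits::has_contiguous_storage<const float *>::value, "");
static_assert(!Vc::Traits::has_contiguous_storage<std::list<int>>::value, "");
#endif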
  572. #ifndef VC_TRAITS_IS_FUNCTOR_ARGUMENT_IMMUTABLE_H_
  573. #define VC_TRAITS_IS_FUNCTOR_ARGUMENT_IMMUTABLE_H_
  574. namespace Vc_VERSIONED_NAMESPACE
  575. {
  576. namespace Traits
  577. {
  578. namespace is_functor_argument_immutable_impl
  579. {
  580. template <typename F, typename A> std::true_type test(void (F::*)(A));
  581. template <typename F, typename A> std::true_type test(void (F::*)(A) const);
  582. template <typename F, typename A> std::is_const<A> test(void (F::*)(A &));
  583. template <typename F, typename A> std::is_const<A> test(void (F::*)(A &) const);
  584. template <typename F, typename A> std::is_const<A> test(void (F::*)(A &&));
  585. template <typename F, typename A> std::is_const<A> test(void (F::*)(A &&) const);
  586. struct dummy {};
  587. template <
  588. typename F, typename A,
  589. #ifdef Vc_MSVC
  590. #define Vc_TEMPLATE_
  591. #else
  592. #define Vc_TEMPLATE_ template
  593. #endif
  594. typename MemberPtr = decltype(&F::Vc_TEMPLATE_ operator()<A>)>
  595. decltype(is_functor_argument_immutable_impl::test(std::declval<MemberPtr>())) test2(int);
  596. #undef Vc_TEMPLATE_
  597. template <typename F, typename A>
  598. decltype(
  599. is_functor_argument_immutable_impl::test(std::declval<decltype(&F::operator())>()))
  600. test2(float);
  601. template <typename A> std::true_type test3(void(*)(A));
  602. template <typename A> std::is_const<A> test3(void(*)(A &));
  603. template <typename A> std::is_const<A> test3(void(*)(A &&));
  604. }
  605. template <typename F, typename A, bool = std::is_function<F>::value>
  606. struct is_functor_argument_immutable;
  607. template <typename F, typename A>
  608. struct is_functor_argument_immutable<F, A, false>
  609. : decltype(is_functor_argument_immutable_impl::test2<
  610. typename std::remove_reference<F>::type, A>(int())) {
  611. };
  612. template <typename F, typename A>
  613. struct is_functor_argument_immutable<F, A, true>
  614. : decltype(is_functor_argument_immutable_impl::test3(std::declval<F>())) {
  615. };
  616. }
  617. }
  618. #endif
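// Editor's sketch (not part of upstream Vc): is_functor_argument_immutable
// detects whether a callable can modify the argument handed to it; taking the
// value or a const reference counts as immutable, a mutable reference does not,
// presumably so call sites can decide whether a write-back is needed.
#if 0
auto reader = [](const float &) {};
auto writer = [](float &x) { x += 1.f; };
static_assert(Vc::Traits::is_functor_argument_immutable<decltype(reader), float>::value, "");
static_assert(!Vc::Traits::is_functor_argument_immutable<decltype(writer), float>::value, "");
#endif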
  619. #ifndef VC_TRAITS_IS_OUTPUT_ITERATOR_H_
  620. #define VC_TRAITS_IS_OUTPUT_ITERATOR_H_
  621. #include <iterator>
  622. namespace Vc_VERSIONED_NAMESPACE
  623. {
  624. namespace Traits
  625. {
  626. namespace is_output_iterator_impl
  627. {
  628. template <typename T, typename ValueType = typename std::iterator_traits<T>::value_type,
  629. typename = decltype(*std::declval<T &>() = std::declval<
  630. ValueType>())
  631. >
  632. std::true_type test(int);
  633. template <typename T> std::false_type test(...);
  634. }
  635. template <typename T>
  636. struct is_output_iterator
  637. : public std::conditional<
  638. std::is_void<typename std::iterator_traits<T>::value_type>::value,
  639. std::true_type, decltype(is_output_iterator_impl::test<T>(int()))>::type
  640. {
  641. };
  642. static_assert(!std::is_void<std::iterator_traits<int *>::value_type>::value, "");
  643. static_assert(is_output_iterator<int *>::value, "");
  644. static_assert(!is_output_iterator<const int *>::value, "");
  645. }
  646. }
  647. #endif
  648. #ifndef VC_IS_INDEX_SEQUENCE_H_
  649. #define VC_IS_INDEX_SEQUENCE_H_
  650. #ifndef VC_COMMON_INDEXSEQUENCE_H_
  651. #define VC_COMMON_INDEXSEQUENCE_H_
  652. namespace Vc_VERSIONED_NAMESPACE
  653. {
  654. template <std::size_t... I> struct index_sequence
  655. {
  656. static constexpr std::size_t size() noexcept { return sizeof...(I); }
  657. };
  658. template <std::size_t N> struct make_index_sequence_impl {
  659. template <std::size_t Offset, std::size_t... Ns>
  660. static index_sequence<Ns..., (Ns + Offset)...> join(std::false_type,
  661. index_sequence<Ns...>);
  662. template <std::size_t Offset, std::size_t... Ns>
  663. static index_sequence<Ns..., Offset - 1, (Ns + Offset)...> join(
  664. std::true_type, index_sequence<Ns...>);
  665. using is_odd = std::integral_constant<bool, N & 1>;
  666. using half = typename make_index_sequence_impl<N / 2>::type;
  667. using type = decltype(join<(N + 1) / 2>(is_odd(), half()));
  668. };
  669. template <> struct make_index_sequence_impl<0> {
  670. using type = index_sequence<>;
  671. };
  672. template <> struct make_index_sequence_impl<1> {
  673. using type = index_sequence<0>;
  674. };
  675. template <> struct make_index_sequence_impl<2> {
  676. using type = index_sequence<0, 1>;
  677. };
  678. template <std::size_t N>
  679. using make_index_sequence = typename make_index_sequence_impl<N>::type;
  680. }
  681. #endif
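// Editor's sketch (not part of upstream Vc): make_index_sequence<N> is the
// C++11 stand-in for std::make_index_sequence, built by recursively joining two
// halves, and expands to index_sequence<0, 1, ..., N-1>.
#if 0
static_assert(std::is_same<Vc::make_index_sequence<3>,
                           Vc::index_sequence<0, 1, 2>>::value, "");
static_assert(Vc::make_index_sequence<3>::size() == 3, "");
#endif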
  682. namespace Vc_VERSIONED_NAMESPACE
  683. {
  684. namespace Traits
  685. {
  686. template <typename T> struct is_index_sequence : public std::false_type {};
  687. template <std::size_t... I>
  688. struct is_index_sequence<Vc::index_sequence<I...>> : public std::true_type {};
  689. static_assert(!is_index_sequence<int>::value, "");
  690. static_assert(is_index_sequence<make_index_sequence<2>>::value, "");
  691. }
  692. }
  693. #endif
  694. #ifndef VC_TRAITS_IS_IMPLICIT_CAST_ALLOWED_H_
  695. #define VC_TRAITS_IS_IMPLICIT_CAST_ALLOWED_H_
  696. namespace Vc_VERSIONED_NAMESPACE
  697. {
  698. namespace Traits
  699. {
  700. template <typename From, typename To, bool = std::is_integral<From>::value>
  701. struct is_implicit_cast_allowed
  702. : public std::integral_constant<
  703. bool, std::is_same<From, To>::value ||
  704. (std::is_integral<To>::value &&
  705. (std::is_same<typename std::make_unsigned<From>::type, To>::value ||
  706. std::is_same<typename std::make_signed<From>::type, To>::value))> {
  707. };
  708. template <typename From, typename To>
  709. struct is_implicit_cast_allowed<From, To, false> : public std::is_same<From, To>::type {
  710. };
  711. template <typename From, typename To>
  712. struct is_implicit_cast_allowed_mask : public is_implicit_cast_allowed<From, To> {
  713. };
  714. }
  715. }
  716. #endif
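// Editor's sketch (not part of upstream Vc): for integral From, the implicit
// cast is allowed only to the identical type or to the type differing solely in
// signedness; everything else (int -> short, float -> double, ...) must be
// explicit.
#if 0
static_assert(Vc::Traits::is_implicit_cast_allowed<int, unsigned int>::value, "");
static_assert(!Vc::Traits::is_implicit_cast_allowed<int, short>::value, "");
static_assert(!Vc::Traits::is_implicit_cast_allowed<float, double>::value, "");
static_assert(Vc::Traits::is_implicit_cast_allowed<float, float>::value, "");
#endif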
  717. namespace Vc_VERSIONED_NAMESPACE
  718. {
  719. struct enable_if_default_type
  720. {
  721. constexpr enable_if_default_type() {}
  722. };
  723. static constexpr enable_if_default_type nullarg;
  724. template <bool Test, typename T = enable_if_default_type> using enable_if = typename std::enable_if<Test, T>::type;
  725. template <bool B, class T, class F>
  726. using conditional_t = typename std::conditional<B, T, F>::type;
  727. template <class T>
  728. using remove_cvref_t =
  729. typename std::remove_cv<typename std::remove_reference<T>::type>::type;
  730. namespace Traits
  731. {
  732. #ifndef VC_TRAITS_HAS_SUBSCRIPT_OPERATOR_H_
  733. #define VC_TRAITS_HAS_SUBSCRIPT_OPERATOR_H_
  734. namespace has_subscript_operator_impl
  735. {
  736. template <typename T, typename I, typename = decltype(std::declval<T &>()[std::declval<I>()])> std::true_type test(int);
  737. template <typename T, typename I> std::false_type test(float);
  738. }
  739. template <typename T, typename I = std::size_t>
  740. struct has_subscript_operator : public decltype(has_subscript_operator_impl::test<T, I>(1))
  741. {
  742. };
  743. #endif
  744. #ifndef VC_TRAITS_HAS_MULTIPLY_OPERATOR_H_
  745. #define VC_TRAITS_HAS_MULTIPLY_OPERATOR_H_
  746. namespace has_multiply_operator_impl
  747. {
  748. template <typename T, typename U, typename = decltype(std::declval<T>() * std::declval<U>())> std::true_type test(int);
  749. template <typename T, typename U> std::false_type test(...);
  750. }
  751. template <typename T, typename U = T>
  752. struct has_multiply_operator : public decltype(has_multiply_operator_impl::test<T, U>(1))
  753. {
  754. };
  755. #endif
  756. #ifndef VC_TRAITS_HAS_ADDITION_OPERATOR_H_
  757. #define VC_TRAITS_HAS_ADDITION_OPERATOR_H_
  758. namespace has_addition_operator_impl
  759. {
  760. template <typename T, typename U, typename = decltype(std::declval<T>() + std::declval<U>())> std::true_type test(int);
  761. template <typename T, typename U> std::false_type test(...);
  762. }
  763. template <typename T, typename U = T>
  764. struct has_addition_operator : public decltype(has_addition_operator_impl::test<T, U>(1))
  765. {
  766. };
  767. #endif
  768. #ifndef VC_TRAITS_HAS_EQUALITY_OPERATOR_H_
  769. #define VC_TRAITS_HAS_EQUALITY_OPERATOR_H_
  770. namespace has_equality_operator_impl
  771. {
  772. template <typename T, typename U,
  773. typename = enable_if<!std::is_same<void, decltype(std::declval<T>() == std::declval<U>())>::value>>
  774. std::true_type test(int);
  775. template <typename T, typename U> std::false_type test(...);
  776. }
  777. template <typename T, typename U = T>
  778. struct has_equality_operator : public decltype(has_equality_operator_impl::test<T, U>(1))
  779. {
  780. };
  781. #endif
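// Editor's sketch (not part of upstream Vc): the has_*_operator traits above are
// plain expression-SFINAE probes; they only ask whether the expression compiles.
// Assumes <vector> is included wherever this is tested.
#if 0
struct NoOperators {};
static_assert(Vc::Traits::has_addition_operator<int, float>::value, "");
static_assert(!Vc::Traits::has_addition_operator<NoOperators>::value, "");
static_assert(Vc::Traits::has_subscript_operator<std::vector<int>, int>::value, "");
static_assert(!Vc::Traits::has_subscript_operator<NoOperators>::value, "");
#endif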
  782. template<typename T> struct is_valid_vector_argument : public std::false_type {};
  783. template <> struct is_valid_vector_argument<double> : public std::true_type {};
  784. template <> struct is_valid_vector_argument<float> : public std::true_type {};
  785. template <> struct is_valid_vector_argument<int> : public std::true_type {};
  786. template <> struct is_valid_vector_argument<unsigned int> : public std::true_type {};
  787. template <> struct is_valid_vector_argument<short> : public std::true_type {};
  788. template <> struct is_valid_vector_argument<unsigned short> : public std::true_type {};
  789. template<typename T> struct is_simd_mask_internal : public std::false_type {};
  790. template<typename T> struct is_simd_vector_internal : public std::false_type {};
  791. template<typename T> struct is_simdarray_internal : public std::false_type {};
  792. template<typename T> struct is_simd_mask_array_internal : public std::false_type {};
  793. template<typename T> struct is_loadstoreflag_internal : public std::false_type {};
  794. template <typename T, bool = is_simd_vector_internal<T>::value> struct is_integral_internal;
  795. template <typename T, bool = is_simd_vector_internal<T>::value> struct is_floating_point_internal;
  796. template <typename T, bool = is_simd_vector_internal<T>::value> struct is_signed_internal;
  797. template <typename T, bool = is_simd_vector_internal<T>::value> struct is_unsigned_internal;
  798. template <typename T> struct is_integral_internal <T, false> : public std::is_integral <T> {};
  799. template <typename T> struct is_floating_point_internal<T, false> : public std::is_floating_point<T> {};
  800. template <typename T> struct is_signed_internal <T, false> : public std::is_signed <T> {};
  801. template <typename T> struct is_unsigned_internal <T, false> : public std::is_unsigned <T> {};
  802. template <typename V> struct is_integral_internal <V, true> : public std::is_integral <typename V::EntryType> {};
  803. template <typename V> struct is_floating_point_internal<V, true> : public std::is_floating_point<typename V::EntryType> {};
  804. template <typename V> struct is_signed_internal <V, true> : public std::is_signed <typename V::EntryType> {};
  805. template <typename V> struct is_unsigned_internal <V, true> : public std::is_unsigned <typename V::EntryType> {};
  806. template <typename T>
  807. struct is_arithmetic_internal
  808. : public std::integral_constant<
  809. bool,
  810. (is_floating_point_internal<T>::value || is_integral_internal<T>::value)>
  811. {
  812. };
  813. template <class T, class = void>
  814. struct vector_size_internal : std::integral_constant<std::size_t, 0> {
  815. };
  816. template <class T>
  817. struct vector_size_internal<T, decltype((void)(T::size() > 0))>
  818. : std::integral_constant<std::size_t, T::size()> {
  819. };
  820. template <typename T>
  821. struct is_simd_mask : public std::integral_constant<bool,
  822. (is_simd_mask_internal<decay<T>>::value ||
  823. is_simd_mask_array_internal<decay<T>>::value)>
  824. {
  825. };
  826. template <typename T>
  827. struct is_simd_vector
  828. : public std::integral_constant<bool,
  829. (is_simd_vector_internal<decay<T>>::value ||
  830. is_simdarray_internal<decay<T>>::value)>
  831. {
  832. };
  833. template <typename T>
  834. struct isSimdArray : public is_simdarray_internal<decay<T>>
  835. {
  836. };
  837. template <typename T>
  838. struct isSimdMaskArray : public is_simd_mask_array_internal<decay<T>>
  839. {
  840. };
  841. template <typename T> struct is_load_store_flag : public is_loadstoreflag_internal<decay<T>> {};
  842. template <typename T> struct is_atomic_simdarray_internal : public std::false_type {};
  843. template <typename T> using isAtomicSimdArray = is_atomic_simdarray_internal<decay<T>>;
  844. template <typename T> struct is_atomic_simd_mask_array_internal : public std::false_type {};
  845. template <typename T> using isAtomicSimdMaskArray = is_atomic_simd_mask_array_internal<decay<T>>;
  846. template <typename T> struct simd_vector_size : public vector_size_internal<decay<T>> {};
  847. template <typename T> struct is_integral : public is_integral_internal<decay<T>> {};
  848. template <typename T> struct is_floating_point : public is_floating_point_internal<decay<T>> {};
  849. template <typename T> struct is_arithmetic : public is_arithmetic_internal<decay<T>> {};
  850. template <typename T> struct is_signed : public is_signed_internal<decay<T>> {};
  851. template <typename T> struct is_unsigned : public is_unsigned_internal<decay<T>> {};
  852. template <typename T, bool IsSimdVector> struct scalar_type_internal { using type = T; };
  853. template <typename T> struct scalar_type_internal<T, true> { using type = typename T::EntryType; };
  854. template <typename T> using scalar_type = typename scalar_type_internal<decay<T>, is_simd_vector<T>::value>::type;
  855. }
  856. }
  857. #ifndef VC_TRAITS_ENTRY_TYPE_OF_H_
  858. #define VC_TRAITS_ENTRY_TYPE_OF_H_
  859. namespace Vc_VERSIONED_NAMESPACE
  860. {
  861. namespace Traits
  862. {
  863. namespace entry_type_of_internal
  864. {
  865. template <typename T, bool = Traits::is_simd_vector<T>::value> struct entry_type;
  866. template <typename T> struct entry_type<T, true>
  867. {
  868. using type = typename decay<T>::EntryType;
  869. };
  870. template <typename T> struct entry_type<T, false>
  871. {
  872. using type = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
  873. };
  874. }
  875. template <typename T> using entry_type_of = typename entry_type_of_internal::entry_type<T>::type;
  876. }
  877. }
  878. #endif
  879. #endif
  880. #ifndef VC_COMMON_PERMUTATION_H_
  881. #define VC_COMMON_PERMUTATION_H_
  882. #ifndef VC_COMMON_MACROS_H_
  883. #define VC_COMMON_MACROS_H_
  884. #ifdef Vc_MSVC
  885. #define Vc_ALIGNED_TYPEDEF(n_,type_,new_type_) \
  886. typedef __declspec(align(n_)) type_ new_type_
887. #elif defined(__GNUC__)

  888. #define Vc_ALIGNED_TYPEDEF(n_,type_,new_type_) \
  889. typedef type_ new_type_[[gnu::aligned(n_)]]
  890. #else
  891. #define Vc_ALIGNED_TYPEDEF(n_,type_,new_type_) \
  892. using new_type_ alignas(sizeof(n_)) = type_
  893. #endif
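// Editor's sketch (not part of upstream Vc): Vc_ALIGNED_TYPEDEF picks whichever
// alignment spelling the compiler understands (__declspec(align), [[gnu::aligned]]
// or alignas). For example, on the MSVC and GNU branches this declares a float
// typedef with 16-byte alignment:
#if 0
Vc_ALIGNED_TYPEDEF(16, float, aligned_float);
#endif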
  894. #ifdef WIN32
  895. #define NOMINMAX 1
  896. #if defined min
  897. #undef min
  898. #endif
  899. #if defined max
  900. #undef max
  901. #endif
  902. #endif
  903. #if defined Vc_GCC && Vc_GCC >= 0x60000
  904. #define Vc_TEMPLATES_DROP_ATTRIBUTES 1
  905. #endif
  906. #if Vc_IS_VERSION_2 || (defined Vc_GCC && Vc_GCC >= 0x60000)
  907. #define Vc_RECURSIVE_MEMORY 1
  908. #endif
  909. #if defined Vc_CLANG || defined Vc_APPLECLANG
  910. #define Vc_UNREACHABLE __builtin_unreachable
  911. #define Vc_NEVER_INLINE [[gnu::noinline]]
  912. #define Vc_INTRINSIC_L inline
  913. #define Vc_INTRINSIC_R __attribute__((always_inline))
  914. #define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R
  915. #define Vc_FLATTEN
  916. #define Vc_CONST __attribute__((const))
  917. #define Vc_CONST_L
  918. #define Vc_CONST_R Vc_CONST
  919. #define Vc_PURE __attribute__((pure))
  920. #define Vc_PURE_L
  921. #define Vc_PURE_R Vc_PURE
  922. #define Vc_MAY_ALIAS __attribute__((may_alias))
  923. #define Vc_ALWAYS_INLINE_L inline
  924. #define Vc_ALWAYS_INLINE_R __attribute__((always_inline))
  925. #define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R
  926. #define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0)
  927. #define Vc_IS_LIKELY(x) __builtin_expect(x, 1)
  928. #define Vc_RESTRICT __restrict__
  929. #define Vc_DEPRECATED(msg)
  930. #define Vc_DEPRECATED_ALIAS(msg)
  931. #define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
  932. #elif defined(__GNUC__)
  933. #define Vc_UNREACHABLE __builtin_unreachable
  934. # if defined Vc_GCC && !defined __OPTIMIZE__
  935. #define Vc_MAY_ALIAS
  936. # else
  937. #define Vc_MAY_ALIAS __attribute__((__may_alias__))
  938. # endif
  939. #define Vc_INTRINSIC_R __attribute__((__always_inline__, __artificial__))
  940. #define Vc_INTRINSIC_L inline
  941. #define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R
  942. #define Vc_FLATTEN __attribute__((__flatten__))
  943. #define Vc_ALWAYS_INLINE_L inline
  944. #define Vc_ALWAYS_INLINE_R __attribute__((__always_inline__))
  945. #define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R
  946. # ifdef Vc_ICC
  947. #define Vc_PURE
  948. #define Vc_CONST
  949. #define Vc_NEVER_INLINE
  950. # else
  951. #define Vc_NEVER_INLINE [[gnu::noinline]]
  952. #define Vc_PURE __attribute__((__pure__))
  953. #define Vc_CONST __attribute__((__const__))
  954. # endif
  955. #define Vc_CONST_L
  956. #define Vc_CONST_R Vc_CONST
  957. #define Vc_PURE_L
  958. #define Vc_PURE_R Vc_PURE
  959. #define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0)
  960. #define Vc_IS_LIKELY(x) __builtin_expect(x, 1)
  961. #define Vc_RESTRICT __restrict__
  962. # ifdef Vc_ICC
  963. #define Vc_DEPRECATED(msg)
  964. #define Vc_DEPRECATED_ALIAS(msg)
  965. # else
  966. #define Vc_DEPRECATED(msg) __attribute__((__deprecated__(msg)))
  967. #define Vc_DEPRECATED_ALIAS(msg) __attribute__((__deprecated__(msg)))
  968. # endif
  969. #define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
  970. #else
  971. #define Vc_NEVER_INLINE
  972. #define Vc_FLATTEN
  973. # ifdef Vc_PURE
  974. #undef Vc_PURE
  975. # endif
  976. #define Vc_MAY_ALIAS
  977. # ifdef Vc_MSVC
  978. #define Vc_ALWAYS_INLINE inline __forceinline
  979. #define Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE
  980. #define Vc_ALWAYS_INLINE_R
  981. #define Vc_CONST __declspec(noalias)
  982. #define Vc_CONST_L Vc_CONST
  983. #define Vc_CONST_R
  984. #define Vc_PURE
  985. #define Vc_PURE_L Vc_PURE
  986. #define Vc_PURE_R
  987. #define Vc_INTRINSIC inline __forceinline
  988. #define Vc_INTRINSIC_L Vc_INTRINSIC
  989. #define Vc_INTRINSIC_R
  990. namespace Vc_VERSIONED_NAMESPACE {
  991. namespace detail
  992. {
  993. static Vc_INTRINSIC void unreachable() { __assume(0); }
  994. }
  995. }
  996. #define Vc_UNREACHABLE Vc::detail::unreachable
  997. # else
  998. #define Vc_ALWAYS_INLINE
  999. #define Vc_ALWAYS_INLINE_L
  1000. #define Vc_ALWAYS_INLINE_R
  1001. #define Vc_CONST
  1002. #define Vc_CONST_L
  1003. #define Vc_CONST_R
  1004. #define Vc_PURE
  1005. #define Vc_PURE_L
  1006. #define Vc_PURE_R
  1007. #define Vc_INTRINSIC
  1008. #define Vc_INTRINSIC_L
  1009. #define Vc_INTRINSIC_R
  1010. #define Vc_UNREACHABLE std::abort
  1011. # endif
  1012. #define Vc_IS_UNLIKELY(x) x
  1013. #define Vc_IS_LIKELY(x) x
  1014. #define Vc_RESTRICT __restrict
  1015. #define Vc_DEPRECATED(msg) __declspec(deprecated(msg))
  1016. #define Vc_DEPRECATED_ALIAS(msg)
  1017. #define Vc_WARN_UNUSED_RESULT
  1018. #endif
  1019. #ifdef Vc_CXX14
  1020. #undef Vc_DEPRECATED
  1021. #define Vc_DEPRECATED(msg_) [[deprecated(msg_)]]
  1022. #endif
  1023. #define Vc_NOTHING_EXPECTING_SEMICOLON static_assert(true, "")
  1024. #define Vc_FREE_STORE_OPERATORS_ALIGNED(align_) \
  1025. \
  1026. \
  1027. \
  1028. Vc_ALWAYS_INLINE void *operator new(size_t size) \
  1029. { \
  1030. return Vc::Common::aligned_malloc<align_>(size); \
  1031. } \
  1032. \
  1033. Vc_ALWAYS_INLINE void *operator new(size_t, void *p) { return p; } \
  1034. \
  1035. Vc_ALWAYS_INLINE void *operator new[](size_t size) \
  1036. { \
  1037. return Vc::Common::aligned_malloc<align_>(size); \
  1038. } \
  1039. \
  1040. Vc_ALWAYS_INLINE void *operator new[](size_t, void *p) { return p; } \
  1041. \
  1042. Vc_ALWAYS_INLINE void operator delete(void *ptr, size_t) { Vc::Common::free(ptr); } \
  1043. \
  1044. Vc_ALWAYS_INLINE void operator delete(void *, void *) {} \
  1045. \
  1046. Vc_ALWAYS_INLINE void operator delete[](void *ptr, size_t) \
  1047. { \
  1048. Vc::Common::free(ptr); \
  1049. } \
  1050. \
  1051. Vc_ALWAYS_INLINE void operator delete[](void *, void *) {} \
  1052. \
  1053. Vc_NOTHING_EXPECTING_SEMICOLON
  1054. #ifdef Vc_ASSERT
  1055. #define Vc_EXTERNAL_ASSERT 1
  1056. #else
  1057. #ifdef NDEBUG
  1058. #define Vc_ASSERT(x)
  1059. #else
  1060. #include <assert.h>
  1061. #define Vc_ASSERT(x) assert(x);
  1062. #endif
  1063. #endif
  1064. #if defined Vc_CLANG || defined Vc_APPLECLANG
  1065. #define Vc_HAS_BUILTIN(x) __has_builtin(x)
  1066. #else
  1067. #define Vc_HAS_BUILTIN(x) 0
  1068. #endif
  1069. #define Vc_CAT_HELPER_(a,b,c,d) a ##b ##c ##d
  1070. #define Vc_CAT(a,b,c,d) Vc_CAT_HELPER_(a, b, c, d)
  1071. #define Vc_CAT_IMPL(a,b) a ##b
  1072. #define Vc_CAT2(a,b) Vc_CAT_IMPL(a, b)
  1073. #define Vc_APPLY_IMPL_1_(macro,a,b,c,d,e) macro(a)
  1074. #define Vc_APPLY_IMPL_2_(macro,a,b,c,d,e) macro(a, b)
  1075. #define Vc_APPLY_IMPL_3_(macro,a,b,c,d,e) macro(a, b, c)
  1076. #define Vc_APPLY_IMPL_4_(macro,a,b,c,d,e) macro(a, b, c, d)
  1077. #define Vc_APPLY_IMPL_5_(macro,a,b,c,d,e) macro(a, b, c, d, e)
  1078. #define Vc_LIST_FLOAT_VECTOR_TYPES(size,macro,a,b,c,d) \
  1079. size(macro, double_v, a, b, c, d) \
  1080. size(macro, float_v, a, b, c, d)
  1081. #define Vc_LIST_INT_VECTOR_TYPES(size,macro,a,b,c,d) \
  1082. size(macro, int_v, a, b, c, d) \
  1083. size(macro, uint_v, a, b, c, d) \
  1084. size(macro, short_v, a, b, c, d) \
  1085. size(macro, ushort_v, a, b, c, d)
  1086. #define Vc_LIST_VECTOR_TYPES(size,macro,a,b,c,d) \
  1087. Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \
  1088. Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d)
  1089. #define Vc_LIST_COMPARES(size,macro,a,b,c,d) \
  1090. size(macro, ==, a, b, c, d) \
  1091. size(macro, !=, a, b, c, d) \
  1092. size(macro, <=, a, b, c, d) \
  1093. size(macro, >=, a, b, c, d) \
  1094. size(macro, < , a, b, c, d) \
  1095. size(macro, > , a, b, c, d)
  1096. #define Vc_LIST_LOGICAL(size,macro,a,b,c,d) \
  1097. size(macro, &&, a, b, c, d) \
  1098. size(macro, ||, a, b, c, d)
  1099. #define Vc_LIST_BINARY(size,macro,a,b,c,d) \
  1100. size(macro, |, a, b, c, d) \
  1101. size(macro, &, a, b, c, d) \
  1102. size(macro, ^, a, b, c, d)
  1103. #define Vc_LIST_SHIFTS(size,macro,a,b,c,d) \
  1104. size(macro, <<, a, b, c, d) \
  1105. size(macro, >>, a, b, c, d)
  1106. #define Vc_LIST_ARITHMETICS(size,macro,a,b,c,d) \
  1107. size(macro, +, a, b, c, d) \
  1108. size(macro, -, a, b, c, d) \
  1109. size(macro, *, a, b, c, d) \
  1110. size(macro, /, a, b, c, d) \
  1111. size(macro, %, a, b, c, d)
  1112. #define Vc_APPLY_0(_list,macro) _list(Vc_APPLY_IMPL_1_, macro, 0, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
  1113. #define Vc_APPLY_1(_list,macro,a) _list(Vc_APPLY_IMPL_2_, macro, a, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
  1114. #define Vc_APPLY_2(_list,macro,a,b) _list(Vc_APPLY_IMPL_3_, macro, a, b, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
  1115. #define Vc_APPLY_3(_list,macro,a,b,c) _list(Vc_APPLY_IMPL_4_, macro, a, b, c, 0) Vc_NOTHING_EXPECTING_SEMICOLON
  1116. #define Vc_APPLY_4(_list,macro,a,b,c,d) _list(Vc_APPLY_IMPL_5_, macro, a, b, c, d) Vc_NOTHING_EXPECTING_SEMICOLON
  1117. #define Vc_ALL_COMPARES(macro) Vc_APPLY_0(Vc_LIST_COMPARES, macro)
  1118. #define Vc_ALL_LOGICAL(macro) Vc_APPLY_0(Vc_LIST_LOGICAL, macro)
  1119. #define Vc_ALL_BINARY(macro) Vc_APPLY_0(Vc_LIST_BINARY, macro)
  1120. #define Vc_ALL_SHIFTS(macro) Vc_APPLY_0(Vc_LIST_SHIFTS, macro)
  1121. #define Vc_ALL_ARITHMETICS(macro) Vc_APPLY_0(Vc_LIST_ARITHMETICS, macro)
  1122. #define Vc_ALL_FLOAT_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_FLOAT_VECTOR_TYPES, macro)
  1123. #define Vc_ALL_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_VECTOR_TYPES, macro)
  1124. #define Vc_EXACT_TYPE(_test,_reference,_type) \
  1125. typename std::enable_if<std::is_same<_test, _reference>::value, _type>::type
  1126. #define Vc_make_unique(name) Vc_CAT(Vc_,name,_,__LINE__)
  1127. #if defined(Vc_ICC) || defined(Vc_CLANG) || defined Vc_APPLECLANG
  1128. #define Vc_OFFSETOF(Type,member) (reinterpret_cast<const char *>(&reinterpret_cast<const Type *>(0)->member) - reinterpret_cast<const char *>(0))
  1129. #else
  1130. #define Vc_OFFSETOF(Type,member) offsetof(Type, member)
  1131. #endif
  1132. #if defined(Vc_NO_NOEXCEPT)
  1133. #define Vc_NOEXCEPT throw()
  1134. #else
  1135. #define Vc_NOEXCEPT noexcept
  1136. #endif
  1137. #ifdef Vc_NO_ALWAYS_INLINE
  1138. #undef Vc_ALWAYS_INLINE
  1139. #undef Vc_ALWAYS_INLINE_L
  1140. #undef Vc_ALWAYS_INLINE_R
  1141. #define Vc_ALWAYS_INLINE inline
  1142. #define Vc_ALWAYS_INLINE_L inline
  1143. #define Vc_ALWAYS_INLINE_R
  1144. #undef Vc_INTRINSIC
  1145. #undef Vc_INTRINSIC_L
  1146. #undef Vc_INTRINSIC_R
  1147. #define Vc_INTRINSIC inline
  1148. #define Vc_INTRINSIC_L inline
  1149. #define Vc_INTRINSIC_R
  1150. #endif
  1151. #endif
  1152. namespace Vc_VERSIONED_NAMESPACE
  1153. {
  1154. namespace Permutation
  1155. {
  1156. struct ReversedTag {};
  1157. constexpr ReversedTag Reversed{};
  1158. }
  1159. }
  1160. #endif
  1161. namespace Vc_VERSIONED_NAMESPACE
  1162. {
  1163. using std::size_t;
  1164. using llong = long long;
  1165. using ullong = unsigned long long;
  1166. using ulong = unsigned long;
  1167. using uint = unsigned int;
  1168. using ushort = unsigned short;
  1169. using uchar = unsigned char;
  1170. using schar = signed char;
  1171. struct VectorSpecialInitializerZero {};
  1172. struct VectorSpecialInitializerOne {};
  1173. struct VectorSpecialInitializerIndexesFromZero {};
  1174. constexpr VectorSpecialInitializerZero Zero = {};
  1175. constexpr VectorSpecialInitializerOne One = {};
  1176. constexpr VectorSpecialInitializerIndexesFromZero IndexesFromZero = {};
  1177. namespace Detail
  1178. {
  1179. template<typename T> struct MayAliasImpl {
  1180. #ifdef __GNUC__
  1181. #pragma GCC diagnostic push
  1182. #pragma GCC diagnostic ignored "-Wattributes"
  1183. #endif
  1184. typedef T type Vc_MAY_ALIAS;
  1185. #ifdef __GNUC__
  1186. #pragma GCC diagnostic pop
  1187. #endif
  1188. };
  1189. }
  1190. #ifdef Vc_ICC
  1191. template <typename T> using MayAlias [[gnu::may_alias]] = T;
  1192. #else
  1193. template <typename T> using MayAlias = typename Detail::MayAliasImpl<T>::type;
  1194. #endif
  1195. template <class To, class From> MayAlias<To> &aliasing_cast(From &x)
  1196. {
  1197. return *reinterpret_cast<MayAlias<To> *>(&x);
  1198. }
  1199. template <class To, class From> const MayAlias<To> &aliasing_cast(const From &x)
  1200. {
  1201. return *reinterpret_cast<const MayAlias<To> *>(&x);
  1202. }
  1203. template <class To, class From> MayAlias<To> *aliasing_cast(From *x)
  1204. {
  1205. return reinterpret_cast<MayAlias<To> *>(x);
  1206. }
  1207. template <class To, class From> const MayAlias<To> *aliasing_cast(const From *x)
  1208. {
  1209. return reinterpret_cast<const MayAlias<To> *>(x);
  1210. }
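// Editor's sketch (not part of upstream Vc): MayAlias<T> attaches the may_alias
// attribute where the compiler supports it, and aliasing_cast uses it to
// reinterpret storage without tripping strict-aliasing optimizations:
#if 0
float f = 1.f;
unsigned int &bits = Vc::aliasing_cast<unsigned int>(f);  // inspect the float's bit pattern
#endif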
  1211. enum class Operator : char {
  1212. Assign,
  1213. Multiply,
  1214. MultiplyAssign,
  1215. Divide,
  1216. DivideAssign,
  1217. Remainder,
  1218. RemainderAssign,
  1219. Plus,
  1220. PlusAssign,
  1221. Minus,
  1222. MinusAssign,
  1223. RightShift,
  1224. RightShiftAssign,
  1225. LeftShift,
  1226. LeftShiftAssign,
  1227. And,
  1228. AndAssign,
  1229. Xor,
  1230. XorAssign,
  1231. Or,
  1232. OrAssign,
  1233. PreIncrement,
  1234. PostIncrement,
  1235. PreDecrement,
  1236. PostDecrement,
  1237. LogicalAnd,
  1238. LogicalOr,
  1239. Comma,
  1240. UnaryPlus,
  1241. UnaryMinus,
  1242. UnaryNot,
  1243. UnaryOnesComplement,
  1244. CompareEqual,
  1245. CompareNotEqual,
  1246. CompareLess,
  1247. CompareGreater,
  1248. CompareLessEqual,
  1249. CompareGreaterEqual
  1250. };
  1251. template <typename T, std::size_t N> struct array;
  1252. namespace Common {
  1253. template <typename T, std::ptrdiff_t N> class span;
  1254. }
  1255. #ifndef Vc_CHECK_ALIGNMENT
  1256. template<typename _T> static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *){}
  1257. #else
  1258. template<typename _T> static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *ptr)
  1259. {
  1260. const size_t s = alignof(_T);
  1261. if((reinterpret_cast<size_t>(ptr) & ((s ^ (s & (s - 1))) - 1)) != 0) {
  1262. fprintf(stderr, "A vector with incorrect alignment has just been created. Look at the stacktrace to find the guilty object.\n");
  1263. abort();
  1264. }
  1265. }
  1266. #endif
  1267. namespace Common
  1268. {
  1269. template <typename T, std::size_t Pieces, std::size_t Index> struct Segment;
  1270. template<size_t StructSize> class SuccessiveEntries
  1271. {
  1272. #ifdef Vc_MSVC
  1273. using size_type = unsigned;
  1274. #else
  1275. using size_type = size_t;
  1276. #endif
  1277. const size_type m_first;
  1278. public:
  1279. typedef SuccessiveEntries AsArg;
  1280. Vc_INTRINSIC SuccessiveEntries(size_type first) : m_first(first) {}
  1281. Vc_INTRINSIC Vc_PURE size_type operator[](size_type offset) const
  1282. {
  1283. return m_first + offset * StructSize;
  1284. }
  1285. Vc_INTRINSIC Vc_PURE size_type data() const { return m_first; }
  1286. Vc_INTRINSIC Vc_PURE SuccessiveEntries operator+(const SuccessiveEntries &rhs) const
  1287. {
  1288. return SuccessiveEntries(m_first + rhs.m_first);
  1289. }
  1290. Vc_INTRINSIC Vc_PURE SuccessiveEntries operator*(const SuccessiveEntries &rhs) const
  1291. {
  1292. return SuccessiveEntries(m_first * rhs.m_first);
  1293. }
  1294. Vc_INTRINSIC Vc_PURE SuccessiveEntries operator<<(size_type x) const
  1295. {
  1296. return {m_first << x};
  1297. }
  1298. friend Vc_INTRINSIC SuccessiveEntries &internal_data(SuccessiveEntries &x)
  1299. {
  1300. return x;
  1301. }
  1302. friend Vc_INTRINSIC const SuccessiveEntries &internal_data(const SuccessiveEntries &x)
  1303. {
  1304. return x;
  1305. }
  1306. };
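// Editor's sketch (not part of upstream Vc): SuccessiveEntries<StructSize> is an
// index generator for gathering one member out of an array of structs; entry i
// maps to first + i * StructSize.
#if 0
Vc::Common::SuccessiveEntries<4> idx(8);
// idx[0] == 8, idx[1] == 12, idx[2] == 16
#endif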
  1307. template <std::size_t alignment>
  1308. Vc_INTRINSIC_L void *aligned_malloc(std::size_t n) Vc_INTRINSIC_R;
  1309. Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R;
  1310. template <typename Mask, typename T, typename U>
  1311. using enable_if_mask_converts_implicitly =
  1312. enable_if<(!std::is_same<Mask, Traits::decay<U>>::value &&
  1313. Traits::is_simd_mask<U>::value && !Traits::isSimdMaskArray<U>::value &&
  1314. Traits::is_implicit_cast_allowed_mask<
  1315. Traits::entry_type_of<typename Traits::decay<U>::Vector>, T>::value)>;
  1316. template <typename T, typename U>
  1317. using enable_if_mask_converts_explicitly = enable_if<(
  1318. Traits::isSimdMaskArray<U>::value ||
  1319. (Traits::is_simd_mask<U>::value &&
  1320. !Traits::is_implicit_cast_allowed_mask<
  1321. Traits::entry_type_of<typename Traits::decay<U>::Vector>, T>::value))>;
  1322. template <typename T> using WidthT = std::integral_constant<std::size_t, sizeof(T)>;
  1323. template <std::size_t Bytes> class MaskBool;
  1324. template <typename T, typename IndexVector, typename Scale, bool>
  1325. class SubscriptOperation;
  1326. template <class T, class IndexVector, int Scale = 1>
  1327. struct GatherArguments {
  1328. static_assert(std::is_same<T, remove_cvref_t<T>>::value && !std::is_pointer<T>::value,
  1329. "GatherArguments expects an cv unqualified non-ref/ptr type");
  1330. const IndexVector indexes;
  1331. const T *const address;
  1332. };
  1333. template <int Scale, class T, class I>
  1334. GatherArguments<T, I, Scale> make_gather(const T *m, const I &i)
  1335. {
  1336. return {i, m};
  1337. }
  1338. template <typename T, typename IndexVector> struct ScatterArguments
  1339. {
  1340. const IndexVector indexes;
  1341. T *const address;
  1342. };
  1343. template <typename I, I Begin, I End, typename F>
  1344. Vc_INTRINSIC enable_if<(Begin >= End), void> unrolled_loop(F &&)
  1345. {
  1346. }
  1347. template <typename I, I Begin, I End, typename F>
  1348. Vc_INTRINSIC Vc_FLATTEN enable_if<(Begin < End), void> unrolled_loop(F &&f)
  1349. {
  1350. f(Begin);
  1351. unrolled_loop<I, Begin + 1, End>(f);
  1352. }
  1353. template <std::size_t Size, typename F> Vc_INTRINSIC void for_all_vector_entries(F &&f)
  1354. {
  1355. unrolled_loop<std::size_t, 0u, Size>(std::forward<F>(f));
  1356. }
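// Editor's sketch (not part of upstream Vc): unrolled_loop expands to
// f(Begin), f(Begin + 1), ..., f(End - 1) with no runtime loop, and
// for_all_vector_entries simply runs it over [0, Size).
#if 0
int sum = 0;
Vc::Common::unrolled_loop<int, 0, 4>([&](int i) { sum += i; });  // sum == 6
#endif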
  1357. }
  1358. }
  1359. #ifndef VC_COMMON_VECTOR_H_
  1360. #define VC_COMMON_VECTOR_H_
  1361. #include <ratio>
  1362. #ifndef VC_COMMON_ELEMENTREFERENCE_H_
  1363. #define VC_COMMON_ELEMENTREFERENCE_H_
  1364. namespace Vc_VERSIONED_NAMESPACE
  1365. {
  1366. namespace Detail
  1367. {
  1368. template <typename U, typename Accessor = U> class ElementReference
  1369. {
  1370. friend U;
  1371. friend Accessor;
  1372. Vc_INTRINSIC ElementReference(U &o, int i) noexcept : index(i), obj(o) {}
  1373. static constexpr bool get_noexcept =
  1374. noexcept(Accessor::get(std::declval<U &>(), int()));
  1375. template <typename T> static constexpr bool set_noexcept()
  1376. {
  1377. return noexcept(Accessor::set(std::declval<U &>(), int(), std::declval<T>()));
  1378. }
  1379. public:
  1380. using value_type = typename U::value_type;
  1381. Vc_INTRINSIC ElementReference(const ElementReference &) = delete;
  1382. Vc_INTRINSIC ElementReference(ElementReference &&) = default;
  1383. Vc_INTRINSIC operator value_type() const noexcept(get_noexcept)
  1384. {
  1385. return Accessor::get(obj, index);
  1386. }
  1387. template <typename T>
  1388. Vc_INTRINSIC ElementReference &operator=(T &&x) &&
  1389. noexcept(noexcept(Accessor::set(std::declval<U &>(), int(), std::declval<T>())))
  1390. {
  1391. Accessor::set(obj, index, std::forward<T>(x));
  1392. return *this;
  1393. }
  1394. #define Vc_OP_(op_) \
  1395. template <typename T, typename R = decltype(std::declval<const value_type &>() \
  1396. op_ std::declval<T>())> \
  1397. Vc_INTRINSIC ElementReference &operator op_##=(T &&x) && \
  1398. noexcept(get_noexcept && noexcept(Accessor::set(std::declval<U &>(), int(), \
  1399. std::declval<R &&>()))) \
  1400. { \
  1401. const value_type &lhs = Accessor::get(obj, index); \
  1402. Accessor::set(obj, index, lhs op_ std::forward<T>(x)); \
  1403. return *this; \
  1404. }
  1405. Vc_ALL_ARITHMETICS(Vc_OP_);
  1406. Vc_ALL_SHIFTS(Vc_OP_);
  1407. Vc_ALL_BINARY(Vc_OP_);
  1408. #undef Vc_OP_
  1409. template <typename = void>
  1410. Vc_INTRINSIC ElementReference &operator++() &&
  1411. noexcept(noexcept(std::declval<value_type &>() =
  1412. Accessor::get(std::declval<U &>(), int())) &&
  1413. set_noexcept<decltype(++std::declval<value_type &>())>())
  1414. {
  1415. value_type x = Accessor::get(obj, index);
  1416. Accessor::set(obj, index, ++x);
  1417. return *this;
  1418. }
  1419. template <typename = void>
  1420. Vc_INTRINSIC value_type operator++(int) &&
  1421. noexcept(noexcept(std::declval<value_type &>() =
  1422. Accessor::get(std::declval<U &>(), int())) &&
  1423. set_noexcept<decltype(std::declval<value_type &>()++)>())
  1424. {
  1425. const value_type r = Accessor::get(obj, index);
  1426. value_type x = r;
  1427. Accessor::set(obj, index, ++x);
  1428. return r;
  1429. }
  1430. template <typename = void>
  1431. Vc_INTRINSIC ElementReference &operator--() &&
  1432. noexcept(noexcept(std::declval<value_type &>() =
  1433. Accessor::get(std::declval<U &>(), int())) &&
  1434. set_noexcept<decltype(--std::declval<value_type &>())>())
  1435. {
  1436. value_type x = Accessor::get(obj, index);
  1437. Accessor::set(obj, index, --x);
  1438. return *this;
  1439. }
  1440. template <typename = void>
  1441. Vc_INTRINSIC value_type operator--(int) &&
  1442. noexcept(noexcept(std::declval<value_type &>() =
  1443. Accessor::get(std::declval<U &>(), int())) &&
  1444. set_noexcept<decltype(std::declval<value_type &>()--)>())
  1445. {
  1446. const value_type r = Accessor::get(obj, index);
  1447. value_type x = r;
  1448. Accessor::set(obj, index, --x);
  1449. return r;
  1450. }
  1451. friend void swap(ElementReference &&a, ElementReference &&b) {
  1452. value_type tmp(a);
  1453. static_cast<ElementReference &&>(a) = static_cast<value_type>(b);
  1454. static_cast<ElementReference &&>(b) = tmp;
  1455. }
  1456. friend void swap(value_type &a, ElementReference &&b) {
  1457. value_type tmp(a);
  1458. a = static_cast<value_type>(b);
  1459. static_cast<ElementReference &&>(b) = tmp;
  1460. }
  1461. friend void swap(ElementReference &&a, value_type &b) {
  1462. value_type tmp(a);
  1463. static_cast<ElementReference &&>(a) = b;
  1464. b = tmp;
  1465. }
  1466. private:
  1467. int index;
  1468. U &obj;
  1469. };
  1470. }
  1471. }
  1472. #endif
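// Editor's sketch (not part of upstream Vc): ElementReference is the proxy that
// Vector::operator[] is expected to return; reads go through Accessor::get and
// writes through Accessor::set, so compound assignment on one lane is a
// read-modify-write. Vc::float_v below is assumed to be available to user code.
#if 0
Vc::float_v v(Vc::Zero);
v[0] += 1.f;         // set(obj, 0, get(obj, 0) + 1.f)
float first = v[0];  // get(obj, 0)
#endif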
  1473. #ifndef VC_COMMON_VECTORABI_H_
  1474. #define VC_COMMON_VECTORABI_H_
  1475. namespace Vc_VERSIONED_NAMESPACE
  1476. {
  1477. namespace VectorAbi
  1478. {
  1479. template <typename T>
  1480. using Avx1Abi = typename std::conditional<std::is_integral<T>::value, VectorAbi::Sse,
  1481. VectorAbi::Avx>::type;
  1482. template <typename T> struct DeduceCompatible {
  1483. #ifdef __x86_64__
  1484. using type = Sse;
  1485. #else
  1486. using type = Scalar;
  1487. #endif
  1488. };
  1489. template <typename T>
  1490. struct DeduceBest {
  1491. using type = typename std::conditional<
  1492. CurrentImplementation::is(ScalarImpl), Scalar,
  1493. typename std::conditional<
  1494. CurrentImplementation::is_between(SSE2Impl, SSE42Impl), Sse,
  1495. typename std::conditional<
  1496. CurrentImplementation::is(AVXImpl), Avx1Abi<T>,
  1497. typename std::conditional<CurrentImplementation::is(AVX2Impl), Avx,
  1498. void>::type>::type>::type>::type;
  1499. };
  1500. template <typename T> using Best = typename DeduceBest<T>::type;
  1501. }
  1502. }
  1503. #ifndef VC_COMMON_SIMDARRAYFWD_H_
  1504. #define VC_COMMON_SIMDARRAYFWD_H_
  1505. #ifndef VC_SSE_TYPES_H_
  1506. #define VC_SSE_TYPES_H_
  1507. #ifdef Vc_DEFAULT_IMPL_SSE
  1508. #define Vc_DOUBLE_V_SIZE 2
  1509. #define Vc_FLOAT_V_SIZE 4
  1510. #define Vc_INT_V_SIZE 4
  1511. #define Vc_UINT_V_SIZE 4
  1512. #define Vc_SHORT_V_SIZE 8
  1513. #define Vc_USHORT_V_SIZE 8
  1514. #endif
  1515. namespace Vc_VERSIONED_NAMESPACE
  1516. {
  1517. namespace SSE
  1518. {
  1519. template <typename T> using Vector = Vc::Vector<T, VectorAbi::Sse>;
  1520. typedef Vector<double> double_v;
  1521. typedef Vector<float> float_v;
  1522. typedef Vector<int> int_v;
  1523. typedef Vector<unsigned int> uint_v;
  1524. typedef Vector<short> short_v;
  1525. typedef Vector<unsigned short> ushort_v;
  1526. template <typename T> using Mask = Vc::Mask<T, VectorAbi::Sse>;
  1527. typedef Mask<double> double_m;
  1528. typedef Mask<float> float_m;
  1529. typedef Mask<int> int_m;
  1530. typedef Mask<unsigned int> uint_m;
  1531. typedef Mask<short> short_m;
  1532. typedef Mask<unsigned short> ushort_m;
  1533. template <typename T> struct Const;
  1534. template <typename T> struct is_vector : public std::false_type {};
  1535. template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
  1536. template <typename T> struct is_mask : public std::false_type {};
  1537. template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
  1538. }
  1539. namespace Traits
  1540. {
  1541. template <class T> struct
  1542. is_simd_vector_internal<Vector<T, VectorAbi::Sse>>
  1543. : public is_valid_vector_argument<T> {};
  1544. template<typename T> struct is_simd_mask_internal<Mask<T, VectorAbi::Sse>>
  1545. : public std::true_type {};
  1546. }
  1547. }
  1548. #endif
  1549. #ifndef VC_AVX_TYPES_H_
  1550. #define VC_AVX_TYPES_H_
  1551. #ifndef VC_AVX_MACROS_H_
  1552. #define VC_AVX_MACROS_H_
  1553. #endif
  1554. #ifdef Vc_DEFAULT_IMPL_AVX2
  1555. #define Vc_DOUBLE_V_SIZE 4
  1556. #define Vc_FLOAT_V_SIZE 8
  1557. #define Vc_INT_V_SIZE 8
  1558. #define Vc_UINT_V_SIZE 8
  1559. #define Vc_SHORT_V_SIZE 16
  1560. #define Vc_USHORT_V_SIZE 16
  1561. #elif defined Vc_DEFAULT_IMPL_AVX
  1562. #define Vc_DOUBLE_V_SIZE 4
  1563. #define Vc_FLOAT_V_SIZE 8
  1564. #define Vc_INT_V_SIZE 4
  1565. #define Vc_UINT_V_SIZE 4
  1566. #define Vc_SHORT_V_SIZE 8
  1567. #define Vc_USHORT_V_SIZE 8
  1568. #endif
  1569. namespace Vc_VERSIONED_NAMESPACE
  1570. {
  1571. namespace AVX
  1572. {
  1573. template <typename T> using Vector = Vc::Vector<T, VectorAbi::Avx1Abi<T>>;
  1574. typedef Vector<double> double_v;
  1575. typedef Vector<float> float_v;
  1576. typedef Vector<int> int_v;
  1577. typedef Vector<unsigned int> uint_v;
  1578. typedef Vector<short> short_v;
  1579. typedef Vector<unsigned short> ushort_v;
  1580. template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx1Abi<T>>;
  1581. typedef Mask<double> double_m;
  1582. typedef Mask<float> float_m;
  1583. typedef Mask<int> int_m;
  1584. typedef Mask<unsigned int> uint_m;
  1585. typedef Mask<short> short_m;
  1586. typedef Mask<unsigned short> ushort_m;
  1587. template <typename T> struct Const;
  1588. template <typename T> struct is_vector : public std::false_type {};
  1589. template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
  1590. template <typename T> struct is_mask : public std::false_type {};
  1591. template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
  1592. }
  1593. namespace AVX2
  1594. {
  1595. template <typename T> using Vector = Vc::Vector<T, VectorAbi::Avx>;
  1596. using double_v = Vector<double>;
  1597. using float_v = Vector< float>;
  1598. using int_v = Vector< int>;
  1599. using uint_v = Vector< uint>;
  1600. using short_v = Vector< short>;
  1601. using ushort_v = Vector<ushort>;
  1602. template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx>;
  1603. using double_m = Mask<double>;
  1604. using float_m = Mask< float>;
  1605. using llong_m = Mask< llong>;
  1606. using ullong_m = Mask<ullong>;
  1607. using long_m = Mask< long>;
  1608. using ulong_m = Mask< ulong>;
  1609. using int_m = Mask< int>;
  1610. using uint_m = Mask< uint>;
  1611. using short_m = Mask< short>;
  1612. using ushort_m = Mask<ushort>;
  1613. using schar_m = Mask< schar>;
  1614. using uchar_m = Mask< uchar>;
  1615. template <typename T> struct is_vector : public std::false_type {};
  1616. template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
  1617. template <typename T> struct is_mask : public std::false_type {};
  1618. template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
  1619. }
  1620. namespace Traits
  1621. {
  1622. template <class T> struct
  1623. is_simd_vector_internal<Vector<T, VectorAbi::Avx>>
  1624. : public is_valid_vector_argument<T> {};
  1625. template<typename T> struct is_simd_mask_internal<Mask<T, VectorAbi::Avx>>
  1626. : public std::true_type {};
  1627. }
  1628. }
  1629. #endif
  1630. #ifndef VC_COMMON_UTILITY_H_
  1631. #define VC_COMMON_UTILITY_H_
  1632. namespace Vc_VERSIONED_NAMESPACE
  1633. {
  1634. namespace Common
  1635. {
  1636. template <size_t x, bool = (x & (x - 1)) == 0> struct NextPowerOfTwo;
  1637. template <size_t x>
  1638. struct NextPowerOfTwo<x, true> : public std::integral_constant<size_t, x> {
  1639. };
  1640. template <size_t x>
  1641. struct NextPowerOfTwo<x, false>
  1642. : public std::integral_constant<
  1643. size_t, NextPowerOfTwo<(x | (x >> 1) | (x >> 2) | (x >> 5)) + 1>::value> {
  1644. };
  1645. template <size_t A>
  1646. struct BoundedAlignment : public std::integral_constant<size_t,
  1647. #if defined Vc_MSVC || defined Vc_GCC
  1648. ((A - 1) &
  1649. #ifdef Vc_MSVC
  1650. 31
  1651. #elif defined __AVX__
  1652. 255
  1653. #else
  1654. 127
  1655. #endif
  1656. ) + 1
  1657. #else
  1658. A
  1659. #endif
  1660. > {
  1661. };
  1662. template <std::size_t N> static constexpr std::size_t left_size()
  1663. {
  1664. return Common::NextPowerOfTwo<(N + 1) / 2>::value;
  1665. }
  1666. template <std::size_t N> static constexpr std::size_t right_size()
  1667. {
  1668. return N - left_size<N>();
  1669. }
  1670. }
  1671. }
  1672. #endif
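// Editor's sketch (not part of upstream Vc): NextPowerOfTwo rounds up to a power
// of two, and left_size/right_size describe how SimdArray<T, N> is split into a
// power-of-two-sized left half plus a remainder:
#if 0
static_assert(Vc::Common::NextPowerOfTwo<5>::value == 8, "");
static_assert(Vc::Common::left_size<7>() == 4 && Vc::Common::right_size<7>() == 3, "");
#endif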
  1673. namespace Vc_VERSIONED_NAMESPACE
  1674. {
  1675. template <class T, int N>
  1676. class Vector<T, simd_abi::fixed_size<N>> : public SimdArray<T, N>
  1677. {
  1678. using SimdArray<T, N>::SimdArray;
  1679. public:
  1680. Vc_INTRINSIC Vector(const Vector &x) : SimdArray<T, N>(x) {}
  1681. Vc_INTRINSIC Vector &operator=(const Vector &x)
  1682. {
  1683. SimdArray<T, N>::operator=(x);
  1684. return *this;
  1685. }
  1686. Vector() = default;
  1687. using abi_type = simd_abi::fixed_size<N>;
  1688. using abi = abi_type;
  1689. Vc_DEPRECATED("use Vector([](int n) { return n; }) instead of "
  1690. "Vector::IndexesFromZero()") static Vector IndexesFromZero()
  1691. {
  1692. return Vector([](size_t i) -> T { return i; });
  1693. }
  1694. Vc_DEPRECATED("use 0 instead of Vector::Zero()") static Vector Zero() { return 0; }
  1695. Vc_DEPRECATED("use 1 instead of Vector::One()") static Vector One() { return 1; }
  1696. };
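// Editor's sketch (not part of upstream Vc): Vector<T, simd_abi::fixed_size<N>>
// is a thin wrapper over SimdArray<T, N>, so the inherited SimdArray
// constructors are available, including the generator form used by
// IndexesFromZero() above:
#if 0
Vc::Vector<float, Vc::simd_abi::fixed_size<7>> v([](size_t i) { return i * 0.5f; });
#endif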
  1697. template <class T, int N>
  1698. class Mask<T, simd_abi::fixed_size<N>> : public SimdMaskArray<T, N>
  1699. {
  1700. using SimdMaskArray<T, N>::SimdMaskArray;
  1701. public:
  1702. Vc_INTRINSIC Mask(const Mask &x) : SimdMaskArray<T, N>(x) {}
  1703. Vc_INTRINSIC Mask &operator=(const Mask &x)
  1704. {
  1705. SimdMaskArray<T, N>::operator=(x);
  1706. return *this;
  1707. }
  1708. Mask() = default;
  1709. using abi_type = simd_abi::fixed_size<N>;
  1710. using abi = abi_type;
  1711. };
  1712. template <typename T, std::size_t N> struct SimdArrayTraits {
  1713. static constexpr std::size_t N0 = Common::left_size<N>();
  1714. static constexpr std::size_t N1 = Common::right_size<N>();
  1715. using storage_type0 = fixed_size_simd<T, N0>;
  1716. using storage_type1 = fixed_size_simd<T, N1>;
  1717. };
  1718. template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
  1719. Vc_INTRINSIC_L typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
  1720. SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
  1721. template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
  1722. Vc_INTRINSIC_L typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
  1723. SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
  1724. template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
  1725. Vc_INTRINSIC_L const typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
  1726. const SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
  1727. template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
  1728. Vc_INTRINSIC_L const typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
  1729. const SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
  1730. template <typename T, std::size_t N, typename V>
  1731. Vc_INTRINSIC_L V &internal_data(SimdArray<T, N, V, N> &x) Vc_INTRINSIC_R;
  1732. template <typename T, std::size_t N, typename V>
  1733. Vc_INTRINSIC_L const V &internal_data(const SimdArray<T, N, V, N> &x) Vc_INTRINSIC_R;
  1734. namespace Traits
  1735. {
  1736. template <class T> struct is_fixed_size_simd : std::false_type {
  1737. };
  1738. template <class T, int N>
  1739. struct is_fixed_size_simd<fixed_size_simd<T, N>> : std::true_type {
  1740. };
  1741. template <class T, int N>
  1742. struct is_fixed_size_simd<fixed_size_simd_mask<T, N>> : std::true_type {
  1743. };
  1744. template <class T, int N>
  1745. struct is_simd_vector_internal<fixed_size_simd<T, N>> : is_valid_vector_argument<T> {};
  1746. template <class T, int N>
  1747. struct is_simd_mask_internal<fixed_size_simd_mask<T, N>> : is_valid_vector_argument<T> {};
  1748. template <typename T, std::size_t N, typename V>
  1749. struct is_atomic_simdarray_internal<SimdArray<T, N, V, N>> : is_valid_vector_argument<T> {};
  1750. template <typename T, int N>
  1751. struct is_atomic_simdarray_internal<fixed_size_simd<T, N>>
  1752. : is_atomic_simdarray_internal<SimdArray<T, N>> {
  1753. };
  1754. template <typename T, std::size_t N, typename V>
  1755. struct is_atomic_simd_mask_array_internal<SimdMaskArray<T, N, V, N>>
  1756. : is_valid_vector_argument<T> {
  1757. };
  1758. template <typename T, int N>
  1759. struct is_atomic_simd_mask_array_internal<fixed_size_simd_mask<T, N>>
  1760. : is_atomic_simd_mask_array_internal<SimdMaskArray<T, N>> {
  1761. };
  1762. template <typename T, std::size_t N, typename VectorType, std::size_t M>
  1763. struct is_simdarray_internal<SimdArray<T, N, VectorType, M>>
  1764. : is_valid_vector_argument<T> {
  1765. };
  1766. template <typename T, int N>
  1767. struct is_simdarray_internal<fixed_size_simd<T, N>> : is_valid_vector_argument<T> {
  1768. };
  1769. template <typename T, std::size_t N, typename VectorType, std::size_t M>
  1770. struct is_simd_mask_array_internal<SimdMaskArray<T, N, VectorType, M>>
  1771. : is_valid_vector_argument<T> {
  1772. };
  1773. template <typename T, int N>
  1774. struct is_simd_mask_array_internal<fixed_size_simd_mask<T, N>>
  1775. : is_valid_vector_argument<T> {
  1776. };
  1777. template <typename T, std::size_t N, typename V, std::size_t M>
  1778. struct is_integral_internal<SimdArray<T, N, V, M>, false> : std::is_integral<T> {
  1779. };
  1780. template <typename T, std::size_t N, typename V, std::size_t M>
  1781. struct is_floating_point_internal<SimdArray<T, N, V, M>, false>
  1782. : std::is_floating_point<T> {
  1783. };
  1784. template <typename T, std::size_t N, typename V, std::size_t M>
  1785. struct is_signed_internal<SimdArray<T, N, V, M>, false> : std::is_signed<T> {
  1786. };
  1787. template <typename T, std::size_t N, typename V, std::size_t M>
  1788. struct is_unsigned_internal<SimdArray<T, N, V, M>, false> : std::is_unsigned<T> {
  1789. };
  1790. template <typename T, std::size_t N>
  1791. struct has_no_allocated_data_impl<Vc::SimdArray<T, N>> : std::true_type {
  1792. };
  1793. }
  1794. }
  1795. #endif
  1796. namespace Vc_VERSIONED_NAMESPACE
  1797. {
  1798. namespace detail
  1799. {
  1800. template <class T> struct is_fixed_size_abi : std::false_type {
  1801. };
  1802. template <int N> struct is_fixed_size_abi<simd_abi::fixed_size<N>> : std::true_type {
  1803. };
  1804. template <class T>
  1805. using not_fixed_size_abi = typename std::enable_if<!is_fixed_size_abi<T>::value, T>::type;
  1806. }
  1807. }
  1808. #endif
  1809. #ifndef VC_COMMON_VECTORTRAITS_H_
  1810. #define VC_COMMON_VECTORTRAITS_H_
  1811. namespace Vc_VERSIONED_NAMESPACE
  1812. {
  1813. template <typename T, typename Abi> struct VectorTraits;
  1814. }
  1815. #endif
  1816. #ifndef VC_COMMON_LOADSTOREFLAGS_H_
  1817. #define VC_COMMON_LOADSTOREFLAGS_H_
  1818. namespace Vc_VERSIONED_NAMESPACE
  1819. {
  1820. struct Exclusive {};
  1821. struct Shared {};
  1822. namespace LoadStoreFlags
  1823. {
  1824. struct StreamingFlag {};
  1825. struct UnalignedFlag {};
  1826. struct PrefetchFlagBase {};
  1827. template <size_t L1 = 16 * 64, size_t L2 = 128 * 64, typename ExclusiveOrShared_ = void>
  1828. struct PrefetchFlag : public PrefetchFlagBase {
  1829. typedef ExclusiveOrShared_ ExclusiveOrShared;
  1830. static constexpr size_t L1Stride = L1;
  1831. static constexpr size_t L2Stride = L2;
  1832. static constexpr bool IsExclusive = std::is_same<ExclusiveOrShared, Exclusive>::value;
  1833. static constexpr bool IsShared = std::is_same<ExclusiveOrShared, Shared>::value;
  1834. };
  1835. template<typename Base, typename Default, typename... LoadStoreFlags> struct ExtractType
  1836. {
  1837. typedef Default type;
  1838. };
  1839. template<typename Base, typename Default, typename T, typename... LoadStoreFlags> struct ExtractType<Base, Default, T, LoadStoreFlags...>
  1840. {
  1841. typedef typename std::conditional<std::is_base_of<Base, T>::value, T, typename ExtractType<Base, Default, LoadStoreFlags...>::type>::type type;
  1842. };
  1843. #ifdef Vc_ICC
  1844. #pragma warning(disable: 177)
  1845. #endif
  1846. template<typename... Flags> struct LoadStoreFlags
  1847. {
  1848. private:
  1849. typedef typename ExtractType<PrefetchFlagBase, PrefetchFlag<0, 0>, Flags...>::type Prefetch;
  1850. public:
  1851. constexpr LoadStoreFlags() {}
  1852. static constexpr bool IsStreaming = !std::is_same<typename ExtractType<StreamingFlag, void, Flags...>::type, void>::value;
  1853. static constexpr bool IsUnaligned = !std::is_same<typename ExtractType<UnalignedFlag, void, Flags...>::type, void>::value;
  1854. static constexpr bool IsAligned = !IsUnaligned;
  1855. static constexpr bool IsPrefetch = !std::is_same<typename ExtractType<PrefetchFlagBase, void, Flags...>::type, void>::value;
  1856. static constexpr bool IsExclusivePrefetch = Prefetch::IsExclusive;
  1857. static constexpr bool IsSharedPrefetch = Prefetch::IsShared;
  1858. static constexpr size_t L1Stride = Prefetch::L1Stride;
  1859. static constexpr size_t L2Stride = Prefetch::L2Stride;
  1860. typedef LoadStoreFlags<typename std::conditional<std::is_same<Flags, UnalignedFlag>::value, void, Flags>::type...> UnalignedRemoved;
  1861. typedef typename std::conditional<IsAligned && !IsStreaming, void *, void>::type EnableIfAligned;
  1862. typedef typename std::conditional<IsAligned && IsStreaming, void *, void>::type EnableIfStreaming;
  1863. typedef typename std::conditional<IsUnaligned && !IsStreaming, void *, void>::type EnableIfUnalignedNotStreaming;
  1864. typedef typename std::conditional<IsUnaligned && IsStreaming, void *, void>::type EnableIfUnalignedAndStreaming;
  1865. typedef typename std::conditional<IsUnaligned , void *, void>::type EnableIfUnaligned;
  1866. typedef typename std::conditional<!IsUnaligned , void *, void>::type EnableIfNotUnaligned;
  1867. typedef typename std::conditional<IsPrefetch , void *, void>::type EnableIfPrefetch;
  1868. typedef typename std::conditional<!IsPrefetch , void *, void>::type EnableIfNotPrefetch;
  1869. };
  1870. template<> struct LoadStoreFlags<>
  1871. {
  1872. constexpr LoadStoreFlags() {}
  1873. static constexpr bool IsStreaming = false;
  1874. static constexpr bool IsUnaligned = false;
  1875. static constexpr bool IsAligned = !IsUnaligned;
  1876. static constexpr bool IsPrefetch = false;
  1877. static constexpr bool IsExclusivePrefetch = false;
  1878. static constexpr bool IsSharedPrefetch = false;
  1879. static constexpr size_t L1Stride = 0;
  1880. static constexpr size_t L2Stride = 0;
  1881. typedef void* EnableIfAligned;
  1882. typedef void* EnableIfNotUnaligned;
  1883. typedef void* EnableIfNotPrefetch;
  1884. };
  1885. template<typename... LFlags, typename... RFlags>
  1886. constexpr LoadStoreFlags<LFlags..., RFlags...> operator|(LoadStoreFlags<LFlags...>, LoadStoreFlags<RFlags...>)
  1887. {
  1888. return LoadStoreFlags<LFlags..., RFlags...>();
  1889. }
  1890. }
  1891. using LoadStoreFlags::PrefetchFlag;
  1892. typedef LoadStoreFlags::LoadStoreFlags<> AlignedTag;
  1893. typedef LoadStoreFlags::LoadStoreFlags<LoadStoreFlags::StreamingFlag> StreamingTag;
  1894. typedef LoadStoreFlags::LoadStoreFlags<LoadStoreFlags::UnalignedFlag> UnalignedTag;
  1895. typedef UnalignedTag DefaultLoadTag;
  1896. typedef UnalignedTag DefaultStoreTag;
  1897. constexpr AlignedTag Aligned;
  1898. constexpr UnalignedTag Unaligned;
  1899. constexpr StreamingTag Streaming;
  1900. constexpr LoadStoreFlags::LoadStoreFlags<PrefetchFlag<>> PrefetchDefault;
  1901. template <size_t L1 = PrefetchFlag<>::L1Stride,
  1902. size_t L2 = PrefetchFlag<>::L2Stride,
  1903. typename ExclusiveOrShared = PrefetchFlag<>::ExclusiveOrShared>
  1904. struct Prefetch : public LoadStoreFlags::LoadStoreFlags<PrefetchFlag<L1, L2, ExclusiveOrShared>>
  1905. {
  1906. };
  1907. namespace Traits
  1908. {
  1909. template <typename... Ts>
  1910. struct is_loadstoreflag_internal<LoadStoreFlags::LoadStoreFlags<Ts...>> : public std::true_type
  1911. {
  1912. };
  1913. template <size_t L1, size_t L2, typename ExclusiveOrShared>
  1914. struct is_loadstoreflag_internal<Prefetch<L1, L2, ExclusiveOrShared>> : public std::true_type
  1915. {
  1916. };
  1917. }
  1918. }
  1919. #endif
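// Usage sketch for the load/store policy tags defined above, assuming the usual
// Vc::float_v typedef; Unaligned is the default for both loads and stores:
//   float *p = /* ... */;
//   Vc::float_v a(p, Vc::Aligned);    // p must be aligned to float_v::MemoryAlignment
//   Vc::float_v b(p, Vc::Unaligned);  // no alignment requirement (the default)
//   a.store(p, Vc::Streaming);        // non-temporal (cache-bypassing) store
//   Vc::float_v c(p, Vc::Unaligned | Vc::PrefetchDefault);  // flags combine via operator|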
  1920. #ifndef VC_COMMON_WRITEMASKEDVECTOR_H_
  1921. #define VC_COMMON_WRITEMASKEDVECTOR_H_
  1922. #include <utility>
  1923. namespace Vc_VERSIONED_NAMESPACE
  1924. {
  1925. namespace Common
  1926. {
  1927. template <typename V, typename M = typename V::Mask> class WriteMaskedVector
  1928. {
  1929. static_assert(
  1930. V::Size == M::Size,
  1931. "incorrect use of Vc::Common::WriteMaskedVector<V, M>. V and M must have the same «Size».");
  1932. public:
  1933. typedef M Mask;
  1934. static constexpr size_t Size = V::Size;
  1935. Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask));
  1936. Vc_INTRINSIC WriteMaskedVector(V &v, const Mask &k) : mask(k), vec(v)
  1937. {
  1938. }
  1939. Vc_INTRINSIC V &operator++()
  1940. {
  1941. V one = V::One();
  1942. one.setZeroInverted(mask);
  1943. return vec += one;
  1944. }
  1945. Vc_INTRINSIC V &operator--()
  1946. {
  1947. V one = V::One();
  1948. one.setZeroInverted(mask);
  1949. return vec -= one;
  1950. }
  1951. Vc_INTRINSIC V operator++(int)
  1952. {
  1953. V ret(vec);
  1954. operator++();
  1955. return ret;
  1956. }
  1957. Vc_INTRINSIC V operator--(int)
  1958. {
  1959. V ret(vec);
  1960. operator--();
  1961. return ret;
  1962. }
  1963. #define Vc_OPERATOR_(op) \
  1964. template <typename U> Vc_ALWAYS_INLINE void operator op##=(U &&x) \
  1965. { \
  1966. operator=(static_cast<V>(vec op std::forward<U>(x))); \
  1967. }
  1968. Vc_ALL_BINARY(Vc_OPERATOR_);
  1969. Vc_ALL_ARITHMETICS(Vc_OPERATOR_);
  1970. Vc_ALL_SHIFTS(Vc_OPERATOR_);
  1971. #undef Vc_OPERATOR_
  1972. Vc_ALWAYS_INLINE void operator=(const V &x)
  1973. {
  1974. vec.assign(x, mask);
  1975. }
  1976. template <typename T, typename I, typename S>
  1977. Vc_ALWAYS_INLINE void operator=(SubscriptOperation<T, I, S, true> &&x)
  1978. {
  1979. vec.gather(std::move(x).gatherArguments(), mask);
  1980. }
  1981. template <typename F> Vc_INTRINSIC void call(const F &f) const
  1982. {
  1983. return vec.call(f, mask);
  1984. }
  1985. template <typename F> Vc_INTRINSIC V apply(const F &f) const
  1986. {
  1987. return vec.apply(f, mask);
  1988. }
  1989. template <typename F> Vc_INTRINSIC void call(F &&f) const
  1990. {
  1991. return vec.call(std::forward<F>(f), mask);
  1992. }
  1993. template <typename F> Vc_INTRINSIC V apply(F &&f) const
  1994. {
  1995. return vec.apply(std::forward<F>(f), mask);
  1996. }
  1997. private:
1998. #ifdef Vc_ICC
1999. const Mask &mask;  // ICC builds hold only a reference to the mask
2000. #else
2001. const Mask mask;  // other compilers keep a copy of the mask
2002. #endif
  2003. V &vec;
  2004. };
  2005. }
  2006. }
  2007. #endif
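// Usage sketch for WriteMaskedVector, assuming the usual Vc::float_v typedef:
// Vector::operator()(mask) returns this proxy, so assignments and compound
// assignments modify only the lanes selected by the mask.
//   Vc::float_v x = /* ... */;
//   auto negative = x < 0.f;   // float_v::MaskType
//   x(negative) = 0.f;         // overwrite only the negative lanes
//   x(negative) += 1.f;        // compound operators are write-masked as well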
  2008. #ifndef VC_COMMON_DETAIL_H_
  2009. #define VC_COMMON_DETAIL_H_
  2010. #include <vector>
  2011. namespace Vc_VERSIONED_NAMESPACE
  2012. {
  2013. namespace Common
  2014. {
  2015. template <typename IV>
  2016. Vc_INTRINSIC enable_if<(Traits::is_simd_vector<IV>::value &&
  2017. sizeof(typename IV::EntryType) >= sizeof(int)),
  2018. const IV &>
  2019. convertIndexVector(const IV &indexVector)
  2020. {
  2021. return indexVector;
  2022. }
  2023. template <typename IV>
  2024. Vc_INTRINSIC enable_if<(Traits::is_simd_vector<IV>::value &&
  2025. sizeof(typename IV::EntryType) < sizeof(int)),
  2026. fixed_size_simd<int, IV::Size>>
  2027. convertIndexVector(const IV &indexVector)
  2028. {
  2029. return static_cast<fixed_size_simd<int, IV::Size>>(indexVector);
  2030. }
  2031. template <class T> using promoted_type = decltype(std::declval<T>() + 1);
  2032. template <typename T, std::size_t N>
  2033. Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
  2034. convertIndexVector(const std::array<T, N> &indexVector)
  2035. {
  2036. return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
  2037. Vc::Unaligned};
  2038. }
  2039. template <typename T, std::size_t N>
  2040. Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
  2041. convertIndexVector(const Vc::array<T, N> &indexVector)
  2042. {
  2043. return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
  2044. Vc::Unaligned};
  2045. }
  2046. template <typename T, std::size_t N>
  2047. Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
  2048. convertIndexVector(const T (&indexVector)[N])
  2049. {
  2050. return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
  2051. Vc::Unaligned};
  2052. }
  2053. #ifndef Vc_MSVC
  2054. template <class T>
  2055. enable_if<std::is_pointer<T>::value, void> convertIndexVector(T indexVector) = delete;
  2056. #endif
  2057. template <typename T>
  2058. Vc_INTRINSIC std::vector<promoted_type<T>> convertIndexVector(
  2059. const std::initializer_list<T> &indexVector)
  2060. {
  2061. return {begin(indexVector), end(indexVector)};
  2062. }
  2063. template <typename T>
  2064. Vc_INTRINSIC
  2065. enable_if<(std::is_integral<T>::value && sizeof(T) >= sizeof(int)), std::vector<T>>
  2066. convertIndexVector(const std::vector<T> &indexVector)
  2067. {
  2068. return indexVector;
  2069. }
  2070. template <typename T>
  2071. Vc_INTRINSIC enable_if<(std::is_integral<T>::value && sizeof(T) < sizeof(int)),
  2072. std::vector<promoted_type<T>>>
  2073. convertIndexVector(const std::vector<T> &indexVector)
  2074. {
  2075. return {std::begin(indexVector), std::end(indexVector)};
  2076. }
  2077. template <class T,
  2078. class = enable_if<
  2079. (!std::is_pointer<T>::value && !Traits::is_simd_vector<T>::value &&
  2080. !std::is_lvalue_reference<decltype(std::declval<const T &>()[0])>::value)>>
  2081. Vc_INTRINSIC const T &convertIndexVector(const T &i)
  2082. {
  2083. return i;
  2084. }
  2085. }
  2086. }
  2087. #endif
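// The convertIndexVector overloads above normalize the accepted index containers
// (SIMD vectors, std::array, Vc::array, C arrays, std::vector, initializer
// lists) for gather/scatter; integral entries narrower than int are widened to
// an int-based fixed_size_simd before use. Minimal sketch with hypothetical
// data/idx names:
//   std::array<unsigned short, Vc::float_v::Size> idx = { /* ... */ };
//   Vc::float_v v(data, idx);  // idx is converted to int-sized indexes internally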
  2088. namespace Vc_VERSIONED_NAMESPACE
  2089. {
  2090. template <typename T, typename Abi,
  2091. typename = enable_if<std::is_floating_point<T>::value &&
  2092. !detail::is_fixed_size_abi<Abi>::value>>
  2093. inline Vector<T, Abi> copysign(Vector<T, Abi> magnitude, Vector<T, Abi> sign);
  2094. template <typename T, typename Abi,
  2095. typename = enable_if<std::is_floating_point<T>::value &&
  2096. !detail::is_fixed_size_abi<Abi>::value>>
  2097. inline Vector<T, Abi> exponent(Vector<T, Abi> x);
  2098. template <typename T, typename Abi>
  2099. Vc_INTRINSIC Vc_CONST typename Vector<T, detail::not_fixed_size_abi<Abi>>::MaskType
  2100. isnegative(Vector<T, Abi> x)
  2101. {
  2102. return x < Vector<T, Abi>::Zero();
  2103. }
  2104. template<typename T, typename Abi = VectorAbi::Best<T>> class Vector
  2105. {
  2106. public:
  2107. static constexpr size_t size() { return VectorTraits<T, Abi>::size(); }
  2108. static constexpr size_t MemoryAlignment = VectorTraits<T, Abi>::memoryAlignment();
  2109. using abi = Abi;
  2110. using EntryType = typename VectorTraits<T, Abi>::EntryType;
  2111. using value_type = EntryType;
  2112. using VectorEntryType = typename VectorTraits<T, Abi>::VectorEntryType;
  2113. using VectorType = typename VectorTraits<T, Abi>::VectorType;
  2114. using vector_type = VectorType;
  2115. using MaskType = Vc::Mask<T, Abi>;
  2116. using mask_type = MaskType;
  2117. using MaskArgument = MaskType;
  2118. using VectorArgument = Vector;
  2119. using IndexType = Vc::fixed_size_simd<int, VectorTraits<T, Abi>::size()>;
  2120. using index_type = IndexType;
  2121. using reference = Detail::ElementReference<Vector>;
  2122. static inline Vector Zero();
  2123. static inline Vector One();
  2124. static inline Vector IndexesFromZero();
  2125. static inline Vector Random();
  2126. template <typename G> static inline Vector generate(G gen);
  2127. inline Vector() = default;
  2128. explicit inline Vector(VectorSpecialInitializerZero);
  2129. explicit inline Vector(VectorSpecialInitializerOne);
  2130. explicit inline Vector(VectorSpecialInitializerIndexesFromZero);
  2131. template <typename U>
  2132. inline Vector(Vector<U, abi> x,
  2133. enable_if<Traits::is_implicit_cast_allowed<U, T>::value> = nullarg);
  2134. #if Vc_IS_VERSION_1
  2135. template <typename U>
  2136. Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
  2137. "vector types") inline explicit Vector(
  2138. Vector<U, abi> x,
  2139. enable_if<!Traits::is_implicit_cast_allowed<U, T>::value> = nullarg);
  2140. #endif
  2141. inline Vector(EntryType a);
  2142. template <typename U>
  2143. inline Vector(U a, enable_if<std::is_same<U, int>::value &&
  2144. !std::is_same<U, EntryType>::value> = nullarg);
  2145. explicit Vc_INTRINSIC Vector(const EntryType *mem)
  2146. {
  2147. load(mem);
  2148. }
  2149. template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
  2150. explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
  2151. {
  2152. load(mem, flags);
  2153. }
  2154. template <typename U, typename Flags = DefaultLoadTag,
  2155. typename = enable_if<
  2156. (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
  2157. sizeof(EntryType) >= sizeof(U)) &&
2158. std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
  2159. explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
  2160. {
  2161. load<U, Flags>(x, flags);
  2162. }
  2163. Vc_INTRINSIC void load(const EntryType *mem)
  2164. {
  2165. load(mem, DefaultLoadTag());
  2166. }
  2167. template <typename Flags>
  2168. Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
  2169. load(const EntryType *mem, Flags flags)
  2170. {
  2171. load<EntryType, Flags>(mem, flags);
  2172. }
  2173. private:
  2174. template <typename U, typename Flags>
  2175. struct load_concept : public std::enable_if<
  2176. (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
  2177. sizeof(EntryType) >= sizeof(U)) &&
  2178. std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
  2179. {};
  2180. public:
  2181. template <typename U, typename Flags = DefaultLoadTag>
  2182. Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
  2183. template <
  2184. typename U,
  2185. typename Flags = DefaultStoreTag,
2186. typename = enable_if<std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
  2187. Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
  2188. template <
  2189. typename U,
  2190. typename Flags = DefaultStoreTag,
2191. typename = enable_if<std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
  2192. Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
  2193. Vc_INTRINSIC void store(EntryType *mem) const
  2194. {
  2195. store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
  2196. }
  2197. template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
  2198. Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
  2199. {
  2200. store<EntryType, Flags>(mem, flags);
  2201. }
  2202. Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
  2203. {
  2204. store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
  2205. }
  2206. template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
  2207. Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
  2208. {
  2209. store<EntryType, Flags>(mem, mask, flags);
  2210. }
  2211. inline void setZero();
  2212. inline void setZero(MaskType mask);
  2213. inline void setZeroInverted(MaskType mask);
  2214. inline void setQnan();
  2215. inline void setQnan(MaskType mask);
  2216. #define Vc_CURRENT_CLASS_NAME Vector
  2217. #ifndef Vc_CURRENT_CLASS_NAME
  2218. #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
  2219. #endif
  2220. private:
  2221. template <class MT, class IT, int Scale = 1>
  2222. inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
  2223. template <class MT, class IT, int Scale = 1>
  2224. inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
  2225. MaskArgument mask);
  2226. public:
  2227. #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
  2228. static_assert( \
  2229. std::is_convertible<MT, EntryType>::value, \
  2230. "The memory pointer needs to point to a type that can be converted to the " \
  2231. "EntryType of this SIMD vector type."); \
  2232. static_assert( \
  2233. Vc::Traits::has_subscript_operator<IT>::value, \
  2234. "The indexes argument must be a type that implements the subscript operator."); \
  2235. static_assert( \
  2236. !Traits::is_simd_vector<IT>::value || \
  2237. Traits::simd_vector_size<IT>::value >= Size, \
  2238. "If you use a SIMD vector for the indexes parameter, the index vector must " \
  2239. "have at least as many entries as this SIMD vector."); \
  2240. static_assert( \
  2241. !std::is_array<T>::value || \
  2242. (std::rank<T>::value == 1 && \
  2243. (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
  2244. "If you use a simple array for the indexes parameter, the array must have " \
  2245. "at least as many entries as this SIMD vector.")
  2246. template <typename MT, typename IT,
  2247. typename = enable_if<Traits::has_subscript_operator<IT>::value>>
  2248. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
  2249. {
  2250. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  2251. gatherImplementation(
  2252. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
  2253. }
  2254. template <class MT, class IT, int Scale>
  2255. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
  2256. {
  2257. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  2258. gatherImplementation(args);
  2259. }
  2260. template <typename MT, typename IT,
  2261. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  2262. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
  2263. MaskArgument mask)
  2264. {
  2265. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  2266. gatherImplementation(
  2267. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
  2268. }
  2269. template <class MT, class IT, int Scale>
  2270. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
  2271. MaskArgument mask)
  2272. {
  2273. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  2274. gatherImplementation(args, mask);
  2275. }
  2276. template <typename MT, typename IT,
  2277. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  2278. Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
  2279. {
  2280. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  2281. gatherImplementation(
  2282. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
  2283. }
  2284. template <typename MT, typename IT,
  2285. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  2286. Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
  2287. {
  2288. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  2289. gatherImplementation(
  2290. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
  2291. }
  2292. template <class MT, class IT, int Scale>
  2293. Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
  2294. {
  2295. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  2296. gatherImplementation(args);
  2297. }
  2298. template <class MT, class IT, int Scale>
  2299. Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
  2300. MaskArgument mask)
  2301. {
  2302. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  2303. gatherImplementation(args, mask);
  2304. }
  2305. #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
  2306. private:
  2307. template <typename MT, typename IT>
  2308. inline void scatterImplementation(MT *mem, IT &&indexes) const;
  2309. template <typename MT, typename IT>
  2310. inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
  2311. public:
  2312. #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
  2313. static_assert( \
  2314. std::is_convertible<EntryType, MT>::value, \
  2315. "The memory pointer needs to point to a type that the EntryType of this " \
  2316. "SIMD vector type can be converted to."); \
  2317. static_assert( \
  2318. Vc::Traits::has_subscript_operator<IT>::value, \
  2319. "The indexes argument must be a type that implements the subscript operator."); \
  2320. static_assert( \
  2321. !Traits::is_simd_vector<IT>::value || \
  2322. Traits::simd_vector_size<IT>::value >= Size, \
  2323. "If you use a SIMD vector for the indexes parameter, the index vector must " \
  2324. "have at least as many entries as this SIMD vector."); \
  2325. static_assert( \
  2326. !std::is_array<T>::value || \
  2327. (std::rank<T>::value == 1 && \
  2328. (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
  2329. "If you use a simple array for the indexes parameter, the array must have " \
  2330. "at least as many entries as this SIMD vector.")
  2331. template <typename MT,
  2332. typename IT,
  2333. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  2334. Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
  2335. {
  2336. Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
  2337. scatterImplementation(mem, std::forward<IT>(indexes));
  2338. }
  2339. template <typename MT,
  2340. typename IT,
  2341. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  2342. Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
  2343. {
  2344. Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
  2345. scatterImplementation(mem, std::forward<IT>(indexes), mask);
  2346. }
  2347. template <typename MT, typename IT>
  2348. Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
  2349. {
  2350. scatter(args.address, args.indexes);
  2351. }
  2352. template <typename MT, typename IT>
  2353. Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
  2354. {
  2355. scatter(args.address, args.indexes, mask);
  2356. }
  2357. #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
  2358. #undef Vc_CURRENT_CLASS_NAME
  2359. inline reference operator[](size_t index) noexcept;
  2360. inline EntryType operator[](size_t index) const noexcept;
  2361. inline MaskType operator!() const;
  2362. inline Vector operator~() const;
  2363. inline Vector operator-() const;
  2364. inline Vector operator+() const;
  2365. inline Vector &operator++();
  2366. inline Vector operator++(int);
  2367. inline Vector &operator--();
  2368. inline Vector operator--(int);
  2369. #define Vc_OP(symbol) \
  2370. inline Vc_PURE Vector operator symbol(const Vector &x) const;
  2371. Vc_ALL_ARITHMETICS(Vc_OP);
  2372. Vc_ALL_BINARY(Vc_OP);
  2373. Vc_ALL_SHIFTS(Vc_OP);
  2374. #undef Vc_OP
  2375. #define Vc_CMP_OP(symbol) inline Vc_PURE MaskType operator symbol(const Vector &x) const;
  2376. Vc_ALL_COMPARES(Vc_CMP_OP);
  2377. #undef Vc_CMP_OP
  2378. inline Common::WriteMaskedVector<Vector, MaskType> operator()(MaskType mask);
  2379. inline EntryType min() const;
  2380. inline EntryType max() const;
  2381. inline EntryType product() const;
  2382. inline EntryType sum() const;
  2383. inline Vector partialSum() const;
  2384. inline EntryType min(MaskType mask) const;
  2385. inline EntryType max(MaskType mask) const;
  2386. inline EntryType product(MaskType mask) const;
  2387. inline EntryType sum(MaskType mask) const;
  2388. inline Vector shifted(int amount) const;
  2389. inline Vector shifted(int amount, Vector shiftIn) const;
  2390. inline Vector rotated(int amount) const;
  2391. inline Vector reversed() const;
  2392. inline Vector sorted() const;
  2393. template <typename F> void callWithValuesSorted(F &&f);
  2394. template <typename F> inline void call(F &&f) const;
  2395. template <typename F> inline void call(F &&f, MaskType mask) const;
  2396. template <typename F> inline Vector apply(F &&f) const;
  2397. template <typename F> inline Vector apply(F &&f, MaskType mask) const;
  2398. template <typename IndexT> inline void fill(EntryType(&f)(IndexT));
  2399. inline void fill(EntryType(&f)());
  2400. inline Vector interleaveLow(Vector x) const;
  2401. inline Vector interleaveHigh(Vector x) const;
  2402. inline void assign(const Vector &v, const MaskType &m);
  2403. inline VectorType &data();
  2404. inline const VectorType &data() const;
  2405. Vc_DEPRECATED("use exponent(x) instead") inline Vector exponent() const;
  2406. Vc_DEPRECATED("use isnegative(x) instead") inline MaskType isNegative() const;
  2407. static constexpr size_t Size = VectorTraits<T, Abi>::size();
  2408. template <typename V2> inline V2 staticCast() const;
  2409. template <typename V2>
  2410. Vc_DEPRECATED("use reinterpret_components_cast instead") inline V2
  2411. reinterpretCast() const;
  2412. Vc_DEPRECATED("use copysign(x, y) instead") inline Vector
  2413. copySign(Vector reference) const;
  2414. Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Vector));
  2415. private:
  2416. VectorType d;
  2417. };
  2418. template <typename V, typename T, typename Abi>
  2419. Vc_ALWAYS_INLINE Vc_CONST enable_if<
  2420. (V::size() == Vector<T, Abi>::size() &&
  2421. sizeof(typename V::VectorEntryType) ==
  2422. sizeof(typename Vector<T, Abi>::VectorEntryType) &&
  2423. sizeof(V) == sizeof(Vector<T, Abi>) && alignof(V) <= alignof(Vector<T, Abi>)),
  2424. V>
  2425. reinterpret_components_cast(const Vector<T, Abi> &x)
  2426. {
  2427. return reinterpret_cast<const V &>(x);
  2428. }
// Note: Vc_OP is defined and immediately undefined below without being expanded;
// no compound-assignment operators are declared at this point.
2429. #define Vc_OP(symbol) \
  2430. template <typename T, typename Abi> \
  2431. inline Vector<T, Abi> &operator symbol##=(Vector<T, Abi> &, \
  2432. const Vector<T, Abi> &x);
  2433. #undef Vc_OP
  2434. }
  2435. #endif
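// The class above declares the common Vector<T, Abi> interface; the ABI-specific
// specializations (Scalar below, SSE/AVX/... in their own headers) provide the
// definitions. Minimal usage sketch, assuming Vc::float_v:
//   Vc::float_v v = Vc::float_v::generate([](int i) { return i * 0.5f; });
//   v += 1.f;               // element-wise arithmetic
//   float total = v.sum();  // horizontal reduction
//   float first = v[0];     // scalar element access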
  2436. #ifndef VC_COMMON_MASK_H_
  2437. #define VC_COMMON_MASK_H_
  2438. namespace Vc_VERSIONED_NAMESPACE
  2439. {
  2440. template <typename T, typename Abi = VectorAbi::Best<T>> class Mask
  2441. {
  2442. public:
  2443. static constexpr size_t size() { return VectorTraits<T, Abi>::size(); }
  2444. static constexpr size_t Size = VectorTraits<T, Abi>::size();
  2445. static constexpr size_t MemoryAlignment = VectorTraits<T, Abi>::maskMemoryAlignment();
  2446. using abi = Abi;
  2447. using EntryType = bool;
  2448. using value_type = EntryType;
  2449. using EntryReference = typename VectorTraits<T, Abi>::EntryReference;
  2450. using value_reference = EntryReference;
  2451. using VectorEntryType = typename VectorTraits<T, Abi>::VectorEntryType;
  2452. using VectorType = typename VectorTraits<T, Abi>::VectorType;
  2453. using vector_type = VectorType;
  2454. Vc_INTRINSIC static Mask Zero();
  2455. Vc_INTRINSIC static Mask One();
  2456. template <typename G> static Vc_INTRINSIC Mask generate(G &&gen);
  2457. Vc_INTRINSIC Mask() = default;
  2458. Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero);
  2459. Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne);
  2460. Vc_INTRINSIC explicit Mask(bool b);
  2461. template <typename U>
  2462. Vc_INTRINSIC Mask(U &&otherMask,
  2463. Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg);
  2464. #if Vc_IS_VERSION_1
  2465. template <typename U>
  2466. Vc_DEPRECATED(
  2467. "use simd_cast instead of explicit type casting to convert between mask types")
  2468. Vc_INTRINSIC_L
  2469. explicit Mask(U &&otherMask, Common::enable_if_mask_converts_explicitly<T, U> =
  2470. nullarg) Vc_INTRINSIC_R;
  2471. #endif
  2472. Vc_ALWAYS_INLINE explicit Mask(const bool *mem);
  2473. template <typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags flags);
  2474. Vc_ALWAYS_INLINE void load(const bool *mem);
  2475. template <typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags flags);
  2476. Vc_ALWAYS_INLINE void store(bool *mem) const;
  2477. template <typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags flags) const;
  2478. Vc_ALWAYS_INLINE bool operator==(const Mask &mask) const;
  2479. Vc_ALWAYS_INLINE bool operator!=(const Mask &mask) const;
  2480. Vc_ALWAYS_INLINE Mask operator&&(const Mask &mask) const;
  2481. Vc_ALWAYS_INLINE Mask operator&(const Mask &mask) const;
  2482. Vc_ALWAYS_INLINE Mask operator||(const Mask &mask) const;
  2483. Vc_ALWAYS_INLINE Mask operator|(const Mask &mask) const;
  2484. Vc_ALWAYS_INLINE Mask operator^(const Mask &mask) const;
  2485. Vc_ALWAYS_INLINE Mask operator!() const;
  2486. Vc_ALWAYS_INLINE Mask &operator&=(const Mask &mask);
  2487. Vc_ALWAYS_INLINE Mask &operator|=(const Mask &mask);
  2488. Vc_ALWAYS_INLINE Mask &operator^=(const Mask &mask);
  2489. Vc_ALWAYS_INLINE bool isFull() const;
  2490. Vc_ALWAYS_INLINE bool isNotEmpty() const;
  2491. Vc_ALWAYS_INLINE bool isEmpty() const;
  2492. Vc_ALWAYS_INLINE bool isMix() const;
  2493. Vc_ALWAYS_INLINE bool data() const;
  2494. Vc_ALWAYS_INLINE bool dataI() const;
  2495. Vc_ALWAYS_INLINE bool dataD() const;
  2496. Vc_ALWAYS_INLINE EntryReference operator[](size_t index);
  2497. Vc_ALWAYS_INLINE EntryType operator[](size_t index) const;
  2498. Vc_ALWAYS_INLINE int count() const;
  2499. Vc_ALWAYS_INLINE int firstOne() const;
  2500. Vc_ALWAYS_INLINE int toInt() const;
  2501. Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const;
  2502. Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask));
  2503. private:
  2504. VectorType d;
  2505. };
  2506. template<typename Mask> constexpr bool all_of(const Mask &m) { return m.isFull(); }
  2507. constexpr bool all_of(bool b) { return b; }
  2508. template<typename Mask> constexpr bool any_of(const Mask &m) { return m.isNotEmpty(); }
  2509. constexpr bool any_of(bool b) { return b; }
  2510. template<typename Mask> constexpr bool none_of(const Mask &m) { return m.isEmpty(); }
  2511. constexpr bool none_of(bool b) { return !b; }
  2512. template<typename Mask> constexpr bool some_of(const Mask &m) { return m.isMix(); }
  2513. constexpr bool some_of(bool) { return false; }
  2514. }
  2515. #endif
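// Usage sketch for the mask reductions above, assuming Vc::float_v:
//   Vc::float_v x = /* ... */;
//   auto m = x > 0.f;
//   if (Vc::all_of(m))  { /* every lane is positive */ }
//   if (Vc::none_of(m)) { /* no lane is positive */ }
//   if (Vc::some_of(m)) { /* some, but not all, lanes are positive */ }
//   int n = m.count();  // number of true lanes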
  2516. #ifndef VC_COMMON_MEMORYFWD_H_
  2517. #define VC_COMMON_MEMORYFWD_H_
  2518. namespace Vc_VERSIONED_NAMESPACE
  2519. {
  2520. namespace Common
  2521. {
  2522. template <typename V, std::size_t Size1 = 0, std::size_t Size2 = 0,
  2523. bool InitPadding = true>
  2524. class Memory;
  2525. template <typename V, typename Parent, int Dimension, typename RowMemory>
  2526. class MemoryBase;
  2527. }
  2528. using Common::Memory;
  2529. }
  2530. #endif
  2531. #endif
  2532. #ifndef VC_SCALAR_TYPES_H_
  2533. #define VC_SCALAR_TYPES_H_
  2534. #ifdef Vc_DEFAULT_IMPL_Scalar
  2535. #define Vc_DOUBLE_V_SIZE 1
  2536. #define Vc_FLOAT_V_SIZE 1
  2537. #define Vc_INT_V_SIZE 1
  2538. #define Vc_UINT_V_SIZE 1
  2539. #define Vc_SHORT_V_SIZE 1
  2540. #define Vc_USHORT_V_SIZE 1
  2541. #endif
  2542. namespace Vc_VERSIONED_NAMESPACE
  2543. {
  2544. namespace Scalar
  2545. {
  2546. template <typename T> using Vector = Vc::Vector<T, VectorAbi::Scalar>;
  2547. typedef Vector<double> double_v;
  2548. typedef Vector<float> float_v;
  2549. typedef Vector<int> int_v;
  2550. typedef Vector<unsigned int> uint_v;
  2551. typedef Vector<short> short_v;
  2552. typedef Vector<unsigned short> ushort_v;
  2553. template <typename T> using Mask = Vc::Mask<T, VectorAbi::Scalar>;
  2554. typedef Mask<double> double_m;
  2555. typedef Mask<float> float_m;
  2556. typedef Mask<int> int_m;
  2557. typedef Mask<unsigned int> uint_m;
  2558. typedef Mask<short> short_m;
  2559. typedef Mask<unsigned short> ushort_m;
  2560. template <typename T> struct is_vector : public std::false_type {};
  2561. template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
  2562. template <typename T> struct is_mask : public std::false_type {};
  2563. template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
  2564. }
  2565. namespace Traits
  2566. {
  2567. template <typename T> struct is_simd_mask_internal<Scalar::Mask<T>>
  2568. : public std::true_type {};
  2569. template <class T> struct
  2570. is_simd_vector_internal<Vector<T, VectorAbi::Scalar>>
  2571. : public is_valid_vector_argument<T> {};
  2572. }
  2573. }
  2574. #endif
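// With the scalar implementation every "vector" holds exactly one element and
// every mask wraps a single bool; it serves as the portable fallback when no
// SIMD target is selected. For example:
//   static_assert(Vc::Scalar::float_v::size() == 1, "scalar vectors have one lane");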
  2575. #ifndef VC_SCALAR_DETAIL_H_
  2576. #define VC_SCALAR_DETAIL_H_
  2577. #ifndef VC_SCALAR_MACROS_H_
  2578. #define VC_SCALAR_MACROS_H_
  2579. #endif
  2580. namespace Vc_VERSIONED_NAMESPACE
  2581. {
  2582. namespace Detail
  2583. {
  2584. template<typename V, size_t Size, size_t VSize> struct InterleaveImpl;
  2585. template<typename V, size_t VSize> struct InterleaveImpl<V, 1, VSize> {
  2586. template <typename I>
  2587. static inline void interleave(typename V::EntryType *const data, const I &i,
  2588. const typename V::AsArg v0, const typename V::AsArg v1)
  2589. {
  2590. data[i[0] + 0] = v0.data();
  2591. data[i[0] + 1] = v1.data();
  2592. }
  2593. template <typename I>
  2594. static inline void interleave(typename V::EntryType *const data, const I &i,
  2595. const typename V::AsArg v0, const typename V::AsArg v1,
  2596. const typename V::AsArg v2)
  2597. {
  2598. data[i[0] + 0] = v0.data();
  2599. data[i[0] + 1] = v1.data();
  2600. data[i[0] + 2] = v2.data();
  2601. }
  2602. template <typename I>
  2603. static inline void interleave(typename V::EntryType *const data, const I &i,
  2604. const typename V::AsArg v0, const typename V::AsArg v1,
  2605. const typename V::AsArg v2, const typename V::AsArg v3)
  2606. {
  2607. data[i[0] + 0] = v0.data();
  2608. data[i[0] + 1] = v1.data();
  2609. data[i[0] + 2] = v2.data();
  2610. data[i[0] + 3] = v3.data();
  2611. }
  2612. template <typename I>
  2613. static inline void interleave(typename V::EntryType *const data, const I &i,
  2614. const typename V::AsArg v0, const typename V::AsArg v1,
  2615. const typename V::AsArg v2, const typename V::AsArg v3,
  2616. const typename V::AsArg v4)
  2617. {
  2618. interleave(data, i, v0, v1, v2, v3);
  2619. data[i[0] + 4] = v4.data();
  2620. }
  2621. template <typename I>
  2622. static inline void interleave(typename V::EntryType *const data, const I &i,
  2623. const typename V::AsArg v0, const typename V::AsArg v1,
  2624. const typename V::AsArg v2, const typename V::AsArg v3,
  2625. const typename V::AsArg v4, const typename V::AsArg v5)
  2626. {
  2627. interleave(data, i, v0, v1, v2, v3);
  2628. interleave(data + 4, i, v4, v5);
  2629. }
  2630. template <typename I>
  2631. static inline void interleave(typename V::EntryType *const data, const I &i,
  2632. const typename V::AsArg v0, const typename V::AsArg v1,
  2633. const typename V::AsArg v2, const typename V::AsArg v3,
  2634. const typename V::AsArg v4, const typename V::AsArg v5,
  2635. const typename V::AsArg v6)
  2636. {
  2637. interleave(data, i, v0, v1, v2, v3);
  2638. interleave(data + 4, i, v4, v5, v6);
  2639. }
  2640. template <typename I>
  2641. static inline void interleave(typename V::EntryType *const data, const I &i,
  2642. const typename V::AsArg v0, const typename V::AsArg v1,
  2643. const typename V::AsArg v2, const typename V::AsArg v3,
  2644. const typename V::AsArg v4, const typename V::AsArg v5,
  2645. const typename V::AsArg v6, const typename V::AsArg v7)
  2646. {
  2647. interleave(data, i, v0, v1, v2, v3);
  2648. interleave(data + 4, i, v4, v5, v6, v7);
  2649. }
  2650. template <typename I>
  2651. static inline void deinterleave(typename V::EntryType const *const data, const I &i,
  2652. V &v0, V &v1)
  2653. {
  2654. v0.data() = data[i[0] + 0];
  2655. v1.data() = data[i[0] + 1];
  2656. }
  2657. template <typename I>
  2658. static inline void deinterleave(typename V::EntryType const *const data, const I &i,
  2659. V &v0, V &v1, V &v2)
  2660. {
  2661. v0.data() = data[i[0] + 0];
  2662. v1.data() = data[i[0] + 1];
  2663. v2.data() = data[i[0] + 2];
  2664. }
  2665. template <typename I>
  2666. static inline void deinterleave(typename V::EntryType const *const data, const I &i,
  2667. V &v0, V &v1, V &v2, V &v3)
  2668. {
  2669. v0.data() = data[i[0] + 0];
  2670. v1.data() = data[i[0] + 1];
  2671. v2.data() = data[i[0] + 2];
  2672. v3.data() = data[i[0] + 3];
  2673. }
  2674. template <typename I>
  2675. static inline void deinterleave(typename V::EntryType const *const data, const I &i,
  2676. V &v0, V &v1, V &v2, V &v3, V &v4)
  2677. {
  2678. deinterleave(data, i, v0, v1, v2, v3);
  2679. v4.data() = data[i[0] + 4];
  2680. }
  2681. template <typename I>
  2682. static inline void deinterleave(typename V::EntryType const *const data, const I &i,
  2683. V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
  2684. {
  2685. deinterleave(data, i, v0, v1, v2, v3);
  2686. deinterleave(data + 4, i, v4, v5);
  2687. }
  2688. template <typename I>
  2689. static inline void deinterleave(typename V::EntryType const *const data, const I &i,
  2690. V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
  2691. {
  2692. deinterleave(data, i, v0, v1, v2, v3);
  2693. deinterleave(data + 4, i, v4, v5, v6);
  2694. }
  2695. template <typename I>
  2696. static inline void deinterleave(typename V::EntryType const *const data, const I &i,
  2697. V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6,
  2698. V &v7)
  2699. {
  2700. deinterleave(data, i, v0, v1, v2, v3);
  2701. deinterleave(data + 4, i, v4, v5, v6, v7);
  2702. }
  2703. };
  2704. }
  2705. }
  2706. #endif
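// InterleaveImpl<V, 1, VSize> is the Scalar-ABI backend for Vc's interleaved
// memory accesses: with one lane per vector, (de)interleaving degenerates to
// copying the 2..8 values at consecutive addresses starting at data[i[0]].
// Sketch of the two-vector case above:
//   interleave(data, i, v0, v1)    ->  data[i[0]] = v0; data[i[0] + 1] = v1;
//   deinterleave(data, i, v0, v1)  ->  v0 = data[i[0]]; v1 = data[i[0] + 1];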
  2707. #ifndef VC_SCALAR_MASK_H_
  2708. #define VC_SCALAR_MASK_H_
  2709. namespace Vc_VERSIONED_NAMESPACE
  2710. {
  2711. template <typename T> class Mask<T, VectorAbi::Scalar>
  2712. {
  2713. friend class Mask< double, VectorAbi::Scalar>;
  2714. friend class Mask< float, VectorAbi::Scalar>;
  2715. friend class Mask< int32_t, VectorAbi::Scalar>;
  2716. friend class Mask<uint32_t, VectorAbi::Scalar>;
  2717. friend class Mask< int16_t, VectorAbi::Scalar>;
  2718. friend class Mask<uint16_t, VectorAbi::Scalar>;
  2719. public:
  2720. using abi = VectorAbi::Scalar;
  2721. static constexpr size_t Size = 1;
  2722. static constexpr size_t MemoryAlignment = 1;
  2723. static constexpr std::size_t size() { return 1; }
  2724. typedef bool EntryType;
  2725. using value_type = EntryType;
  2726. using EntryReference = Vc::Detail::ElementReference<Mask>;
  2727. using reference = EntryReference;
  2728. typedef bool VectorEntryType;
  2729. using VectorType = bool;
  2730. using Vector = Scalar::Vector<T>;
  2731. Vc_INTRINSIC Mask() = default;
  2732. Vc_INTRINSIC explicit Mask(bool b) : m(b) {}
  2733. Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : m(false) {}
  2734. Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : m(true) {}
  2735. Vc_INTRINSIC static Mask Zero() { return Mask(false); }
  2736. Vc_INTRINSIC static Mask One() { return Mask(true); }
  2737. template <typename U>
  2738. Vc_INTRINSIC Mask(U &&rhs, Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg)
  2739. : m(rhs.m) {}
  2740. #if Vc_IS_VERSION_1
  2741. template <typename U>
  2742. Vc_DEPRECATED(
  2743. "use simd_cast instead of explicit type casting to convert between mask types")
  2744. Vc_INTRINSIC_L
  2745. explicit Mask(U &&rhs, Common::enable_if_mask_converts_explicitly<T, U> = nullarg)
  2746. Vc_INTRINSIC_R;
  2747. #endif
  2748. Vc_ALWAYS_INLINE explicit Mask(const bool *mem) : m(mem[0]) {}
  2749. template<typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags) : m(mem[0]) {}
  2750. Vc_ALWAYS_INLINE void load(const bool *mem) { m = mem[0]; }
  2751. template<typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags) { m = mem[0]; }
  2752. Vc_ALWAYS_INLINE void store(bool *mem) const { *mem = m; }
  2753. template<typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags) const { *mem = m; }
  2754. Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return m == rhs.m; }
  2755. Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return m != rhs.m; }
  2756. Vc_ALWAYS_INLINE Mask operator&&(const Mask &rhs) const { return Mask(m && rhs.m); }
  2757. Vc_ALWAYS_INLINE Mask operator& (const Mask &rhs) const { return Mask(m && rhs.m); }
  2758. Vc_ALWAYS_INLINE Mask operator||(const Mask &rhs) const { return Mask(m || rhs.m); }
  2759. Vc_ALWAYS_INLINE Mask operator| (const Mask &rhs) const { return Mask(m || rhs.m); }
  2760. Vc_ALWAYS_INLINE Mask operator^ (const Mask &rhs) const { return Mask(m ^ rhs.m); }
  2761. Vc_ALWAYS_INLINE Mask operator!() const { return Mask(!m); }
  2762. Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { m &= rhs.m; return *this; }
  2763. Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { m |= rhs.m; return *this; }
  2764. Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { m ^= rhs.m; return *this; }
  2765. Vc_ALWAYS_INLINE bool isFull () const { return m; }
  2766. Vc_ALWAYS_INLINE bool isNotEmpty() const { return m; }
  2767. Vc_ALWAYS_INLINE bool isEmpty() const { return !m; }
  2768. Vc_ALWAYS_INLINE bool isMix () const { return false; }
  2769. Vc_ALWAYS_INLINE bool data () const { return m; }
  2770. Vc_ALWAYS_INLINE bool dataI() const { return m; }
  2771. Vc_ALWAYS_INLINE bool dataD() const { return m; }
  2772. private:
  2773. friend reference;
  2774. static Vc_INTRINSIC bool get(const Mask &o, int) noexcept { return o.m; }
  2775. template <typename U>
  2776. static Vc_INTRINSIC void set(Mask &o, int, U &&v) noexcept(
  2777. noexcept(std::declval<bool &>() = std::declval<U>()))
  2778. {
  2779. o.m = std::forward<U>(v);
  2780. }
  2781. public:
  2782. Vc_ALWAYS_INLINE reference operator[](size_t i) noexcept
  2783. {
2784. Vc_ASSERT(i == 0); if (i) {}  // the empty if keeps i "used" when Vc_ASSERT expands to nothing
  2785. return {*this, 0};
  2786. }
  2787. Vc_ALWAYS_INLINE value_type operator[](size_t i) const noexcept
  2788. {
2789. Vc_ASSERT(i == 0); if (i) {}  // the empty if keeps i "used" when Vc_ASSERT expands to nothing
  2790. return m;
  2791. }
  2792. Vc_ALWAYS_INLINE int count() const { return m ? 1 : 0; }
  2793. Vc_ALWAYS_INLINE int firstOne() const { return 0; }
  2794. Vc_ALWAYS_INLINE int toInt() const { return m ? 1 : 0; }
  2795. template <typename G> static Vc_INTRINSIC Mask generate(G &&gen)
  2796. {
  2797. return Mask(gen(0));
  2798. }
  2799. Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const
  2800. {
  2801. if (amount == 0) {
  2802. return *this;
  2803. } else {
  2804. return Zero();
  2805. }
  2806. }
  2807. private:
  2808. bool m;
  2809. };
  2810. template <typename T> constexpr size_t Mask<T, VectorAbi::Scalar>::Size;
  2811. template <typename T> constexpr size_t Mask<T, VectorAbi::Scalar>::MemoryAlignment;
  2812. }
  2813. #endif
  2814. namespace Vc_VERSIONED_NAMESPACE
  2815. {
  2816. #define Vc_CURRENT_CLASS_NAME Vector
  2817. template <typename T> class Vector<T, VectorAbi::Scalar>
  2818. {
  2819. static_assert(std::is_arithmetic<T>::value,
  2820. "Vector<T> only accepts arithmetic builtin types as template parameter T.");
  2821. public:
  2822. using abi = VectorAbi::Scalar;
  2823. using EntryType = T;
  2824. using VectorEntryType = EntryType;
  2825. using value_type = EntryType;
  2826. using VectorType = EntryType;
  2827. using vector_type = VectorType;
  2828. using reference = Detail::ElementReference<Vector>;
  2829. protected:
  2830. VectorType m_data = VectorType();
  2831. template <typename U> using V = Vector<U, abi>;
  2832. public:
  2833. typedef Scalar::Mask<T> Mask;
  2834. using MaskType = Mask;
  2835. using mask_type = Mask;
  2836. typedef Mask MaskArgument;
  2837. typedef Vector AsArg;
  2838. Vc_ALWAYS_INLINE VectorType &data() { return m_data; }
  2839. Vc_ALWAYS_INLINE const VectorType &data() const { return m_data; }
  2840. static constexpr size_t Size = 1;
  2841. static constexpr size_t MemoryAlignment = alignof(VectorType);
  2842. using IndexType = fixed_size_simd<int, 1>;
  2843. public:
  2844. Vc_INTRINSIC Vector() = default;
  2845. static constexpr std::size_t size() { return Size; }
  2846. explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R;
  2847. explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R;
  2848. explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R;
  2849. static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); }
  2850. static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); }
  2851. static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero()
  2852. {
  2853. return Vector(Vc::IndexesFromZero);
  2854. }
  2855. template <class G, int = 0,
  2856. class = typename std::enable_if<std::is_convertible<
  2857. decltype(std::declval<G>()(size_t())), value_type>::value>::type>
  2858. explicit Vector(G &&g) : Vector(generate(std::forward<G>(g)))
  2859. {
  2860. }
  2861. static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R;
  2862. template <typename U>
  2863. Vc_INTRINSIC Vector(
  2864. V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
  2865. void *>::type = nullptr)
  2866. : m_data(static_cast<EntryType>(x.data()))
  2867. {
  2868. }
  2869. #if Vc_IS_VERSION_1
  2870. template <typename U>
  2871. Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
  2872. "vector types") Vc_INTRINSIC
  2873. explicit Vector(
  2874. V<U> x,
  2875. typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
  2876. void *>::type = nullptr)
  2877. : m_data(static_cast<EntryType>(x.data()))
  2878. {
  2879. }
  2880. #endif
  2881. Vc_INTRINSIC Vector(EntryType a) : m_data(a) {}
  2882. template <typename U>
  2883. Vc_INTRINSIC Vector(U a,
  2884. typename std::enable_if<std::is_same<U, int>::value &&
  2885. !std::is_same<U, EntryType>::value,
  2886. void *>::type = nullptr)
  2887. : Vector(static_cast<EntryType>(a))
  2888. {
  2889. }
  2890. explicit Vc_INTRINSIC Vector(const EntryType *mem)
  2891. {
  2892. load(mem);
  2893. }
  2894. template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
  2895. explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
  2896. {
  2897. load(mem, flags);
  2898. }
  2899. template <typename U, typename Flags = DefaultLoadTag,
  2900. typename = enable_if<
  2901. (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
  2902. sizeof(EntryType) >= sizeof(U)) &&
2903. std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
  2904. explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
  2905. {
  2906. load<U, Flags>(x, flags);
  2907. }
  2908. Vc_INTRINSIC void load(const EntryType *mem)
  2909. {
  2910. load(mem, DefaultLoadTag());
  2911. }
  2912. template <typename Flags>
  2913. Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
  2914. load(const EntryType *mem, Flags flags)
  2915. {
  2916. load<EntryType, Flags>(mem, flags);
  2917. }
  2918. private:
  2919. template <typename U, typename Flags>
  2920. struct load_concept : public std::enable_if<
  2921. (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
  2922. sizeof(EntryType) >= sizeof(U)) &&
  2923. std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
  2924. {};
  2925. public:
  2926. template <typename U, typename Flags = DefaultLoadTag>
  2927. Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
  2928. template <
  2929. typename U,
  2930. typename Flags = DefaultStoreTag,
2931. typename = enable_if<std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
  2932. Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
  2933. template <
  2934. typename U,
  2935. typename Flags = DefaultStoreTag,
2936. typename = enable_if<std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
  2937. Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
  2938. Vc_INTRINSIC void store(EntryType *mem) const
  2939. {
  2940. store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
  2941. }
  2942. template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
  2943. Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
  2944. {
  2945. store<EntryType, Flags>(mem, flags);
  2946. }
  2947. Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
  2948. {
  2949. store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
  2950. }
  2951. template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
  2952. Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
  2953. {
  2954. store<EntryType, Flags>(mem, mask, flags);
  2955. }
  2956. Vc_ALWAYS_INLINE void setZero() { m_data = 0; }
  2957. Vc_ALWAYS_INLINE void setZero(Mask k) { if (k.data()) m_data = 0; }
  2958. Vc_ALWAYS_INLINE void setZeroInverted(Mask k) { if (!k.data()) m_data = 0; }
  2959. Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
  2960. Vc_INTRINSIC_L void setQnan(Mask m) Vc_INTRINSIC_R;
  2961. #ifndef Vc_CURRENT_CLASS_NAME
  2962. #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
  2963. #endif
  2964. private:
  2965. template <class MT, class IT, int Scale = 1>
  2966. inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
  2967. template <class MT, class IT, int Scale = 1>
  2968. inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
  2969. MaskArgument mask);
  2970. public:
  2971. #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
  2972. static_assert( \
  2973. std::is_convertible<MT, EntryType>::value, \
  2974. "The memory pointer needs to point to a type that can be converted to the " \
  2975. "EntryType of this SIMD vector type."); \
  2976. static_assert( \
  2977. Vc::Traits::has_subscript_operator<IT>::value, \
  2978. "The indexes argument must be a type that implements the subscript operator."); \
  2979. static_assert( \
  2980. !Traits::is_simd_vector<IT>::value || \
  2981. Traits::simd_vector_size<IT>::value >= Size, \
  2982. "If you use a SIMD vector for the indexes parameter, the index vector must " \
  2983. "have at least as many entries as this SIMD vector."); \
  2984. static_assert( \
  2985. !std::is_array<T>::value || \
  2986. (std::rank<T>::value == 1 && \
  2987. (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
  2988. "If you use a simple array for the indexes parameter, the array must have " \
  2989. "at least as many entries as this SIMD vector.")
  2990. template <typename MT, typename IT,
  2991. typename = enable_if<Traits::has_subscript_operator<IT>::value>>
  2992. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
  2993. {
  2994. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  2995. gatherImplementation(
  2996. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
  2997. }
  2998. template <class MT, class IT, int Scale>
  2999. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
  3000. {
  3001. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  3002. gatherImplementation(args);
  3003. }
  3004. template <typename MT, typename IT,
  3005. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  3006. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
  3007. MaskArgument mask)
  3008. {
  3009. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  3010. gatherImplementation(
  3011. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
  3012. }
  3013. template <class MT, class IT, int Scale>
  3014. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
  3015. MaskArgument mask)
  3016. {
  3017. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  3018. gatherImplementation(args, mask);
  3019. }
  3020. template <typename MT, typename IT,
  3021. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  3022. Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
  3023. {
  3024. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  3025. gatherImplementation(
  3026. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
  3027. }
  3028. template <typename MT, typename IT,
  3029. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  3030. Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
  3031. {
  3032. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  3033. gatherImplementation(
  3034. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
  3035. }
  3036. template <class MT, class IT, int Scale>
  3037. Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
  3038. {
  3039. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  3040. gatherImplementation(args);
  3041. }
  3042. template <class MT, class IT, int Scale>
  3043. Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
  3044. MaskArgument mask)
  3045. {
  3046. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  3047. gatherImplementation(args, mask);
  3048. }
  3049. #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
  3050. private:
  3051. template <typename MT, typename IT>
  3052. inline void scatterImplementation(MT *mem, IT &&indexes) const;
  3053. template <typename MT, typename IT>
  3054. inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
  3055. public:
  3056. #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
  3057. static_assert( \
  3058. std::is_convertible<EntryType, MT>::value, \
  3059. "The memory pointer needs to point to a type that the EntryType of this " \
  3060. "SIMD vector type can be converted to."); \
  3061. static_assert( \
  3062. Vc::Traits::has_subscript_operator<IT>::value, \
  3063. "The indexes argument must be a type that implements the subscript operator."); \
  3064. static_assert( \
  3065. !Traits::is_simd_vector<IT>::value || \
  3066. Traits::simd_vector_size<IT>::value >= Size, \
  3067. "If you use a SIMD vector for the indexes parameter, the index vector must " \
  3068. "have at least as many entries as this SIMD vector."); \
  3069. static_assert( \
  3070. !std::is_array<T>::value || \
  3071. (std::rank<T>::value == 1 && \
  3072. (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
  3073. "If you use a simple array for the indexes parameter, the array must have " \
  3074. "at least as many entries as this SIMD vector.")
  3075. template <typename MT,
  3076. typename IT,
  3077. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  3078. Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
  3079. {
  3080. Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
  3081. scatterImplementation(mem, std::forward<IT>(indexes));
  3082. }
  3083. template <typename MT,
  3084. typename IT,
  3085. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  3086. Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
  3087. {
  3088. Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
  3089. scatterImplementation(mem, std::forward<IT>(indexes), mask);
  3090. }
  3091. template <typename MT, typename IT>
  3092. Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
  3093. {
  3094. scatter(args.address, args.indexes);
  3095. }
  3096. template <typename MT, typename IT>
  3097. Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
  3098. {
  3099. scatter(args.address, args.indexes, mask);
  3100. }
  3101. #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
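// Editor's note -- illustrative sketch, not part of the original Vc sources.
// The gather()/scatter() members declared above perform indexed loads and
// stores: every lane i reads or writes mem[indexes[i]].  A hedged usage
// example (`data` and `idx` are placeholder names; any index container with
// operator[] and enough entries is accepted, as the static_asserts above
// spell out):
//
//   float data[16] = { /* ... */ };
//   int   idx[Vc::float_v::Size] = { 3 };   // simple array of indexes
//   Vc::float_v v;
//   v.gather(data, idx);                    // v[i] = data[idx[i]]
//   v += 1.f;
//   v.scatter(data, idx);                   // data[idx[i]] = v[i]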
  3102. Vc_ALWAYS_INLINE Vector &operator++() { ++m_data; return *this; }
  3103. Vc_ALWAYS_INLINE Vector &operator--() { --m_data; return *this; }
  3104. Vc_ALWAYS_INLINE Vector operator++(int) { return m_data++; }
  3105. Vc_ALWAYS_INLINE Vector operator--(int) { return m_data--; }
  3106. private:
  3107. friend reference;
  3108. Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
  3109. {
  3110. Vc_ASSERT(i == 0); if (i) {}
  3111. return o.m_data;
  3112. }
  3113. template <typename U>
  3114. Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
  3115. noexcept(std::declval<value_type &>() = v))
  3116. {
  3117. Vc_ASSERT(i == 0); if (i) {}
  3118. o.m_data = v;
  3119. }
  3120. public:
  3121. Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
  3122. {
  3123. static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
  3124. return {*this, int(index)};
  3125. }
  3126. Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
  3127. {
  3128. Vc_ASSERT(index == 0); if (index) {}
  3129. return m_data;
  3130. }
  3131. Vc_ALWAYS_INLINE Mask operator!() const
  3132. {
  3133. return Mask(!m_data);
  3134. }
  3135. Vc_ALWAYS_INLINE Vector operator~() const
  3136. {
  3137. #ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
  3138. static_assert(std::is_integral<T>::value, "bit-complement can only be used with Vectors of integral type");
  3139. #endif
  3140. return Vector(~m_data);
  3141. }
  3142. Vc_ALWAYS_INLINE Vector operator-() const
  3143. {
  3144. return -m_data;
  3145. }
  3146. Vc_INTRINSIC Vector Vc_PURE operator+() const { return *this; }
  3147. #define Vc_OP(symbol) \
  3148. Vc_ALWAYS_INLINE Vc_PURE Vector operator symbol(const Vector &x) const { return Vector(m_data symbol x.m_data); }
  3149. Vc_ALL_SHIFTS(Vc_OP);
  3150. #undef Vc_OP
  3151. Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
  3152. isNegative() const
  3153. {
  3154. return Vc::isnegative(*this);
  3155. }
  3156. Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &m) {
  3157. if (m.data()) m_data = v.m_data;
  3158. }
  3159. template <typename V2>
  3160. Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2
  3161. staticCast() const
  3162. {
  3163. return V2(static_cast<typename V2::EntryType>(m_data));
  3164. }
  3165. template <typename V2>
  3166. Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE V2
  3167. reinterpretCast() const
  3168. {
  3169. typedef typename V2::EntryType AliasT2 Vc_MAY_ALIAS;
  3170. return V2(*reinterpret_cast<const AliasT2 *>(&m_data));
  3171. }
  3172. Vc_ALWAYS_INLINE Common::WriteMaskedVector<Vector, Mask> operator()(Mask m)
  3173. {
  3174. return {*this, m};
  3175. }
  3176. Vc_ALWAYS_INLINE EntryType min() const { return m_data; }
  3177. Vc_ALWAYS_INLINE EntryType max() const { return m_data; }
  3178. Vc_ALWAYS_INLINE EntryType product() const { return m_data; }
  3179. Vc_ALWAYS_INLINE EntryType sum() const { return m_data; }
  3180. Vc_ALWAYS_INLINE Vector partialSum() const { return *this; }
  3181. Vc_ALWAYS_INLINE EntryType min(Mask) const { return m_data; }
  3182. Vc_ALWAYS_INLINE EntryType max(Mask) const { return m_data; }
  3183. Vc_ALWAYS_INLINE EntryType product(Mask m) const
  3184. {
  3185. if (m.data()) {
  3186. return m_data;
  3187. } else {
  3188. return EntryType(1);
  3189. }
  3190. }
  3191. Vc_ALWAYS_INLINE EntryType sum(Mask m) const { if (m.data()) return m_data; return static_cast<EntryType>(0); }
  3192. Vc_INTRINSIC Vector Vc_VDECL shifted(int amount, Vector shiftIn) const {
  3193. Vc_ASSERT(amount >= -1 && amount <= 1);
  3194. return amount == 0 ? *this : shiftIn;
  3195. }
  3196. Vc_INTRINSIC Vector shifted(int amount) const { return amount == 0 ? *this : Zero(); }
  3197. Vc_INTRINSIC Vector rotated(int) const { return *this; }
  3198. Vc_INTRINSIC Vector reversed() const { return *this; }
  3199. Vc_INTRINSIC Vector sorted() const { return *this; }
  3200. template <typename F> void callWithValuesSorted(F &&f) { f(m_data); }
  3201. template <typename F> Vc_INTRINSIC void call(F &&f) const { f(m_data); }
  3202. template <typename F> Vc_INTRINSIC void call(F &&f, Mask mask) const
  3203. {
  3204. if (mask.data()) {
  3205. f(m_data);
  3206. }
  3207. }
  3208. template <typename F> Vc_INTRINSIC Vector apply(F &&f) const { return Vector(f(m_data)); }
  3209. template <typename F> Vc_INTRINSIC Vector apply(F &&f, Mask mask) const
  3210. {
  3211. if (mask.data()) {
  3212. return Vector(f(m_data));
  3213. } else {
  3214. return *this;
  3215. }
  3216. }
  3217. template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
  3218. m_data = f(0);
  3219. }
  3220. Vc_INTRINSIC void fill(EntryType (&f)()) {
  3221. m_data = f();
  3222. }
  3223. template <typename G> static Vc_INTRINSIC Vector generate(G gen)
  3224. {
  3225. return gen(0);
  3226. }
  3227. Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector Vc_VDECL
  3228. copySign(Vector x) const
  3229. {
  3230. return Vc::copysign(*this, x);
  3231. }
  3232. Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
  3233. {
  3234. return Vc::exponent(*this);
  3235. }
  3236. Vc_INTRINSIC Vector Vc_VDECL interleaveLow(Vector) const { return *this; }
  3237. Vc_INTRINSIC Vector Vc_VDECL interleaveHigh(Vector x) const { return x; }
  3238. };
  3239. #undef Vc_CURRENT_CLASS_NAME
  3240. template <typename T> constexpr size_t Vector<T, VectorAbi::Scalar>::Size;
  3241. template <typename T> constexpr size_t Vector<T, VectorAbi::Scalar>::MemoryAlignment;
  3242. #define Vc_OP(symbol) \
  3243. template <typename T, typename U, \
  3244. typename = decltype(std::declval<T &>() symbol## = std::declval<T>())> \
  3245. Vc_INTRINSIC enable_if<std::is_convertible<U, Vector<T, VectorAbi::Scalar>>::value, \
  3246. Vector<T, VectorAbi::Scalar>> \
  3247. &operator symbol##=(Vector<T, VectorAbi::Scalar> &lhs, U &&rhs) \
  3248. { \
  3249. lhs.data() symbol## = Vector<T, VectorAbi::Scalar>(std::forward<U>(rhs)).data(); \
  3250. return lhs; \
  3251. }
  3252. Vc_ALL_SHIFTS(Vc_OP);
  3253. #undef Vc_OP
  3254. #define Vc_CONDITIONAL_ASSIGN(name_,op_) \
  3255. template <Operator O, typename T, typename M, typename U> \
  3256. Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
  3257. Vector<T, VectorAbi::Scalar> &lhs, M &&mask, U &&rhs) \
  3258. { \
  3259. if (mask.isFull()) { \
  3260. lhs op_ std::forward<U>(rhs); \
  3261. } \
  3262. } \
  3263. Vc_NOTHING_EXPECTING_SEMICOLON
  3264. Vc_CONDITIONAL_ASSIGN( Assign, =);
  3265. Vc_CONDITIONAL_ASSIGN( PlusAssign, +=);
  3266. Vc_CONDITIONAL_ASSIGN( MinusAssign, -=);
  3267. Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
  3268. Vc_CONDITIONAL_ASSIGN( DivideAssign, /=);
  3269. Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
  3270. Vc_CONDITIONAL_ASSIGN( XorAssign, ^=);
  3271. Vc_CONDITIONAL_ASSIGN( AndAssign, &=);
  3272. Vc_CONDITIONAL_ASSIGN( OrAssign, |=);
  3273. Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
  3274. Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
  3275. #undef Vc_CONDITIONAL_ASSIGN
  3276. #define Vc_CONDITIONAL_ASSIGN(name_,expr_) \
  3277. template <Operator O, typename T, typename M> \
  3278. Vc_INTRINSIC enable_if<O == Operator::name_, Vector<T, VectorAbi::Scalar>> \
  3279. conditional_assign(Vector<T, VectorAbi::Scalar> &lhs, M &&mask) \
  3280. { \
  3281. return mask.isFull() ? (expr_) : lhs; \
  3282. } \
  3283. Vc_NOTHING_EXPECTING_SEMICOLON
  3284. Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs++);
  3285. Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs);
  3286. Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs--);
  3287. Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs);
  3288. #undef Vc_CONDITIONAL_ASSIGN
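// Editor's note -- illustrative sketch, not part of the original Vc sources.
// The two macro families above generate one conditional_assign() overload per
// Operator enumerator.  For the scalar ABI the mask covers a single lane, so
// "assign where the mask is set" degenerates into a plain branch; roughly,
// conditional_assign<Operator::PlusAssign>(v, m, rhs) behaves like:
//
//   if (m.isFull()) {   // scalar mask: its one and only lane is set
//       v += rhs;
//   }
//
// The where() interface defined below routes masked compound assignments onto
// exactly these overloads.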
  3289. }
  3290. #include <cmath>
  3291. #ifndef VC_COMMON_CONST_DATA_H_
  3292. #define VC_COMMON_CONST_DATA_H_
  3293. namespace Vc_VERSIONED_NAMESPACE
  3294. {
  3295. namespace Common
  3296. {
  3297. alignas(64) extern unsigned int RandomState[];
  3298. alignas(32) extern const unsigned int AllBitsSet[8];
  3299. }
  3300. }
  3301. #endif
  3302. #ifndef VC_COMMON_WHERE_H_
  3303. #define VC_COMMON_WHERE_H_
  3304. namespace Vc_VERSIONED_NAMESPACE
  3305. {
  3306. namespace WhereImpl
  3307. {
  3308. template<typename _Mask, typename _LValue> struct MaskedLValue
  3309. {
  3310. typedef _Mask Mask;
  3311. typedef _LValue LValue;
  3312. const Mask &mask;
  3313. LValue &lhs;
  3314. constexpr MaskedLValue(const Mask &m, LValue &l) : mask(m), lhs(l) {}
  3315. MaskedLValue(const MaskedLValue &) = delete;
  3316. #ifndef __cpp_guaranteed_copy_elision
  3317. constexpr MaskedLValue(MaskedLValue &&) = default;
  3318. #endif
  3319. template<typename T> Vc_ALWAYS_INLINE void operator =(T &&rhs) { conditional_assign<Operator:: Assign>(lhs, mask, std::forward<T>(rhs)); }
  3320. template<typename T> Vc_ALWAYS_INLINE void operator +=(T &&rhs) { conditional_assign<Operator:: PlusAssign>(lhs, mask, std::forward<T>(rhs)); }
  3321. template<typename T> Vc_ALWAYS_INLINE void operator -=(T &&rhs) { conditional_assign<Operator:: MinusAssign>(lhs, mask, std::forward<T>(rhs)); }
  3322. template<typename T> Vc_ALWAYS_INLINE void operator *=(T &&rhs) { conditional_assign<Operator:: MultiplyAssign>(lhs, mask, std::forward<T>(rhs)); }
  3323. template<typename T> Vc_ALWAYS_INLINE void operator /=(T &&rhs) { conditional_assign<Operator:: DivideAssign>(lhs, mask, std::forward<T>(rhs)); }
  3324. template<typename T> Vc_ALWAYS_INLINE void operator %=(T &&rhs) { conditional_assign<Operator:: RemainderAssign>(lhs, mask, std::forward<T>(rhs)); }
  3325. template<typename T> Vc_ALWAYS_INLINE void operator ^=(T &&rhs) { conditional_assign<Operator:: XorAssign>(lhs, mask, std::forward<T>(rhs)); }
  3326. template<typename T> Vc_ALWAYS_INLINE void operator &=(T &&rhs) { conditional_assign<Operator:: AndAssign>(lhs, mask, std::forward<T>(rhs)); }
  3327. template<typename T> Vc_ALWAYS_INLINE void operator |=(T &&rhs) { conditional_assign<Operator:: OrAssign>(lhs, mask, std::forward<T>(rhs)); }
  3328. template<typename T> Vc_ALWAYS_INLINE void operator<<=(T &&rhs) { conditional_assign<Operator:: LeftShiftAssign>(lhs, mask, std::forward<T>(rhs)); }
  3329. template<typename T> Vc_ALWAYS_INLINE void operator>>=(T &&rhs) { conditional_assign<Operator::RightShiftAssign>(lhs, mask, std::forward<T>(rhs)); }
  3330. Vc_ALWAYS_INLINE void operator++() { conditional_assign<Operator:: PreIncrement>(lhs, mask); }
  3331. Vc_ALWAYS_INLINE void operator++(int) { conditional_assign<Operator::PostIncrement>(lhs, mask); }
  3332. Vc_ALWAYS_INLINE void operator--() { conditional_assign<Operator:: PreDecrement>(lhs, mask); }
  3333. Vc_ALWAYS_INLINE void operator--(int) { conditional_assign<Operator::PostDecrement>(lhs, mask); }
  3334. template <class T, class IV, class S>
  3335. Vc_INTRINSIC void operator=(Common::SubscriptOperation<T, IV, S, true> &&rhs)
  3336. {
  3337. lhs.gather(std::move(rhs).gatherArguments(), mask);
  3338. }
  3339. template <class T, class IV, class S>
  3340. void operator+=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
  3341. template <class T, class IV, class S>
  3342. void operator-=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
  3343. template <class T, class IV, class S>
  3344. void operator*=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
  3345. template <class T, class IV, class S>
  3346. void operator/=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
  3347. template <class T, class IV, class S>
  3348. void operator%=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
  3349. template <class T, class IV, class S>
  3350. void operator^=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
  3351. template <class T, class IV, class S>
  3352. void operator&=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
  3353. template <class T, class IV, class S>
  3354. void operator|=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
  3355. template <class T, class IV, class S>
  3356. void operator<<=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
  3357. template <class T, class IV, class S>
  3358. void operator>>=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
  3359. };
  3360. template <typename _Mask, typename T_, typename I_, typename S_>
  3361. struct MaskedLValue<_Mask, Common::SubscriptOperation<T_, I_, S_, true>>
  3362. {
  3363. typedef _Mask Mask;
  3364. typedef Common::SubscriptOperation<T_, I_, S_, true> SO;
  3365. const Mask &mask;
  3366. SO &lhs;
  3367. template <typename T> using Decay = typename std::decay<T>::type;
  3368. constexpr MaskedLValue(const Mask &m, SO &&l) : mask(m), lhs(l) {}
  3369. MaskedLValue(const MaskedLValue &) = delete;
  3370. #ifndef __cpp_guaranteed_copy_elision
  3371. constexpr MaskedLValue(MaskedLValue &&) = default;
  3372. #endif
  3373. template <class T> Vc_ALWAYS_INLINE void operator=(T &&rhs) &&
  3374. {
  3375. std::forward<T>(rhs).scatter(std::move(lhs).scatterArguments(), mask);
  3376. }
  3377. };
  3378. template<typename _LValue> struct MaskedLValue<bool, _LValue>
  3379. {
  3380. typedef bool Mask;
  3381. typedef _LValue LValue;
  3382. const Mask &mask;
  3383. LValue &lhs;
  3384. constexpr MaskedLValue(const Mask &m, LValue &l) : mask(m), lhs(l) {}
  3385. MaskedLValue(const MaskedLValue &) = delete;
  3386. constexpr MaskedLValue(MaskedLValue &&) = default;
  3387. template<typename T> Vc_ALWAYS_INLINE void operator =(T &&rhs) { if (mask) lhs = std::forward<T>(rhs); }
  3388. template<typename T> Vc_ALWAYS_INLINE void operator +=(T &&rhs) { if (mask) lhs += std::forward<T>(rhs); }
  3389. template<typename T> Vc_ALWAYS_INLINE void operator -=(T &&rhs) { if (mask) lhs -= std::forward<T>(rhs); }
  3390. template<typename T> Vc_ALWAYS_INLINE void operator *=(T &&rhs) { if (mask) lhs *= std::forward<T>(rhs); }
  3391. template<typename T> Vc_ALWAYS_INLINE void operator /=(T &&rhs) { if (mask) lhs /= std::forward<T>(rhs); }
  3392. template<typename T> Vc_ALWAYS_INLINE void operator %=(T &&rhs) { if (mask) lhs %= std::forward<T>(rhs); }
  3393. template<typename T> Vc_ALWAYS_INLINE void operator ^=(T &&rhs) { if (mask) lhs ^= std::forward<T>(rhs); }
  3394. template<typename T> Vc_ALWAYS_INLINE void operator &=(T &&rhs) { if (mask) lhs &= std::forward<T>(rhs); }
  3395. template<typename T> Vc_ALWAYS_INLINE void operator |=(T &&rhs) { if (mask) lhs |= std::forward<T>(rhs); }
  3396. template<typename T> Vc_ALWAYS_INLINE void operator<<=(T &&rhs) { if (mask) lhs <<= std::forward<T>(rhs); }
  3397. template<typename T> Vc_ALWAYS_INLINE void operator>>=(T &&rhs) { if (mask) lhs >>= std::forward<T>(rhs); }
  3398. Vc_ALWAYS_INLINE void operator++() { if (mask) ++lhs; }
  3399. Vc_ALWAYS_INLINE void operator++(int) { if (mask) lhs++; }
  3400. Vc_ALWAYS_INLINE void operator--() { if (mask) --lhs; }
  3401. Vc_ALWAYS_INLINE void operator--(int) { if (mask) lhs--; }
  3402. };
  3403. template<typename _Mask> struct WhereMask
  3404. {
  3405. typedef _Mask Mask;
  3406. const Mask &mask;
  3407. constexpr WhereMask(const Mask &m) : mask(m) {}
  3408. WhereMask(const WhereMask &) = delete;
  3409. template <typename T, typename I, typename S>
  3410. constexpr Vc_WARN_UNUSED_RESULT
  3411. MaskedLValue<Mask, Common::SubscriptOperation<T, I, S, true>>
  3412. operator|(Common::SubscriptOperation<T, I, S, true> &&lhs) const
  3413. {
  3414. static_assert(!std::is_const<T>::value,
  3415. "masked scatter to constant memory not possible.");
  3416. return {mask, std::move(lhs)};
  3417. }
  3418. template<typename T> constexpr Vc_WARN_UNUSED_RESULT MaskedLValue<Mask, T> operator|(T &&lhs) const
  3419. {
  3420. static_assert(std::is_lvalue_reference<T>::value, "Syntax error: Incorrect use of Vc::where. Maybe operator precedence got you by surprise. Examples of correct usage:\n"
  3421. " Vc::where(x < 2) | x += 1;\n"
  3422. " (Vc::where(x < 2) | x)++;\n"
  3423. " Vc::where(x < 2)(x) += 1;\n"
  3424. " Vc::where(x < 2)(x)++;\n"
  3425. );
  3426. return { mask, lhs };
  3427. }
  3428. template <class T,
  3429. class = decltype(std::declval<T>() = std::declval<const T &>())>
  3430. constexpr Vc_WARN_UNUSED_RESULT MaskedLValue<Mask, T> operator()(T &&lhs) const
  3431. {
  3432. return operator|(std::forward<T>(lhs));
  3433. }
  3434. };
  3435. }
  3436. template<typename M> constexpr Vc_WARN_UNUSED_RESULT WhereImpl::WhereMask<M> where(const M &mask)
  3437. {
  3438. return { mask };
  3439. }
  3440. template <class M, class V>
  3441. constexpr Vc_WARN_UNUSED_RESULT WhereImpl::MaskedLValue<M, V> where(const M &mask,
  3442. V &value)
  3443. {
  3444. return {mask, value};
  3445. }
  3446. template <class M, class T, class IT, class Scale>
  3447. constexpr Vc_WARN_UNUSED_RESULT
  3448. WhereImpl::MaskedLValue<M, Common::SubscriptOperation<T, IT, Scale, true>>
  3449. where(const M &mask, Common::SubscriptOperation<T, IT, Scale, true> &&value)
  3450. {
  3451. return {mask, std::move(value)};
  3452. }
  3453. template<typename M> constexpr Vc_WARN_UNUSED_RESULT WhereImpl::WhereMask<M> _if(const M &m)
  3454. {
  3455. return { m };
  3456. }
  3457. }
  3458. #endif
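// Editor's note -- illustrative sketch, not part of the original Vc sources.
// where(mask) | lhs (or where(mask)(lhs)) builds a MaskedLValue whose
// assignment operators forward to conditional_assign, so only the lanes
// selected by the mask are modified.  A hedged example:
//
//   Vc::float_v x = /* ... */;
//   Vc::where(x < 0.f) | x = -x;        // masked assignment
//   Vc::where(x > 10.f)(x) -= 10.f;     // operator() form, same effect
//   (Vc::where(x < 2.f) | x)++;         // masked increment
//
// The static_assert in WhereMask::operator| rejects rvalue operands, which
// catches the precedence mistake of writing `where(m) | x + 1` instead of
// `where(m) | x += 1`.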
  3459. #ifndef VC_COMMON_TRANSPOSE_H_
  3460. #define VC_COMMON_TRANSPOSE_H_
  3461. #include <tuple>
  3462. namespace Vc_VERSIONED_NAMESPACE
  3463. {
  3464. namespace Common
  3465. {
  3466. template <typename... Inputs> struct TransposeProxy
  3467. {
  3468. TransposeProxy(const Inputs &... inputs) : in{inputs...} {}
  3469. std::tuple<const Inputs &...> in;
  3470. };
  3471. template <int LhsLength, size_t RhsLength> struct TransposeTag {
  3472. };
  3473. }
  3474. template <typename... Vs> Common::TransposeProxy<Vs...> transpose(Vs... vs)
  3475. {
  3476. return {vs...};
  3477. }
  3478. }
  3479. #endif
  3480. #ifndef VC_SCALAR_OPERATORS_H_
  3481. #define VC_SCALAR_OPERATORS_H_
  3482. namespace Vc_VERSIONED_NAMESPACE
  3483. {
  3484. namespace Detail
  3485. {
  3486. #define Vc_OP(op_) \
  3487. template <typename T> \
  3488. Vc_INTRINSIC Scalar::Mask<T> operator op_(Scalar::Vector<T> a, Scalar::Vector<T> b) \
  3489. { \
  3490. return Scalar::Mask<T>(a.data() op_ b.data()); \
  3491. }
  3492. Vc_ALL_COMPARES(Vc_OP);
  3493. #undef Vc_OP
  3494. #define Vc_OP(symbol) \
  3495. template <typename T> \
  3496. Vc_INTRINSIC enable_if<std::is_integral<T>::value, Scalar::Vector<T>> \
  3497. operator symbol(Scalar::Vector<T> a, Scalar::Vector<T> b) \
  3498. { \
  3499. return a.data() symbol b.data(); \
  3500. } \
  3501. template <typename T> \
  3502. Vc_INTRINSIC enable_if<std::is_floating_point<T>::value, Scalar::Vector<T>> \
  3503. operator symbol(Scalar::Vector<T> &lhs, Scalar::Vector<T> rhs) \
  3504. { \
  3505. using uinta = \
  3506. MayAlias<typename std::conditional<sizeof(T) == sizeof(int), unsigned int, \
  3507. unsigned long long>::type>; \
  3508. uinta *left = reinterpret_cast<uinta *>(&lhs.data()); \
  3509. const uinta *right = reinterpret_cast<const uinta *>(&rhs.data()); \
  3510. *left symbol## = *right; \
  3511. return lhs; \
  3512. }
  3513. Vc_ALL_BINARY(Vc_OP);
  3514. #undef Vc_OP
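// Editor's note -- illustrative, not part of the original Vc sources.  The
// second overload generated by Vc_OP above implements bitwise &, |, ^ for
// floating-point vectors by reinterpreting the value as a same-sized unsigned
// integer (through the MayAlias typedef), applying the integer operation in
// place, and returning the modified lhs.  A hedged standalone sketch of the
// same idea, using memcpy instead of an aliasing type:
//
//   #include <cstring>
//   float bit_and(float a, float b) {
//       unsigned int ai, bi;              // assumes sizeof(float) == sizeof(unsigned int)
//       std::memcpy(&ai, &a, sizeof a);
//       std::memcpy(&bi, &b, sizeof b);
//       ai &= bi;
//       std::memcpy(&a, &ai, sizeof a);
//       return a;
//   }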
  3515. template <typename T>
  3516. Vc_INTRINSIC Scalar::Vector<T> operator+(Scalar::Vector<T> a, Scalar::Vector<T> b)
  3517. {
  3518. return a.data() + b.data();
  3519. }
  3520. template <typename T>
  3521. Vc_INTRINSIC Scalar::Vector<T> operator-(Scalar::Vector<T> a, Scalar::Vector<T> b)
  3522. {
  3523. return a.data() - b.data();
  3524. }
  3525. template <typename T>
  3526. Vc_INTRINSIC Scalar::Vector<T> operator*(Scalar::Vector<T> a, Scalar::Vector<T> b)
  3527. {
  3528. return a.data() * b.data();
  3529. }
  3530. template <typename T>
  3531. Vc_INTRINSIC Scalar::Vector<T> operator/(Scalar::Vector<T> a, Scalar::Vector<T> b)
  3532. {
  3533. return a.data() / b.data();
  3534. }
  3535. template <typename T>
  3536. Vc_INTRINSIC Scalar::Vector<T> operator%(Scalar::Vector<T> a, Scalar::Vector<T> b)
  3537. {
  3538. return a.data() % b.data();
  3539. }
  3540. }
  3541. }
  3542. #endif
  3543. namespace Vc_VERSIONED_NAMESPACE
  3544. {
  3545. template <typename T>
  3546. Vc_INTRINSIC Vector<T, VectorAbi::Scalar>::Vector(VectorSpecialInitializerZero)
  3547. : m_data(0)
  3548. {
  3549. }
  3550. template <typename T>
  3551. Vc_INTRINSIC Vector<T, VectorAbi::Scalar>::Vector(VectorSpecialInitializerOne)
  3552. : m_data(1)
  3553. {
  3554. }
  3555. template <typename T>
  3556. Vc_INTRINSIC Vector<T, VectorAbi::Scalar>::Vector(VectorSpecialInitializerIndexesFromZero)
  3557. : m_data(0)
  3558. {
  3559. }
  3560. template <typename T>
  3561. template <typename U, typename Flags>
  3562. Vc_INTRINSIC typename Vector<T, VectorAbi::Scalar>::
  3563. #ifndef Vc_MSVC
  3564. template
  3565. #endif
  3566. load_concept<U, Flags>::type Vector<T, VectorAbi::Scalar>::load(const U *mem, Flags)
  3567. {
  3568. m_data = mem[0];
  3569. }
  3570. template <typename T>
  3571. template <typename U, typename Flags, typename>
  3572. Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::store(U *mem, Flags) const
  3573. {
  3574. mem[0] = m_data;
  3575. }
  3576. template <typename T>
  3577. template <typename U, typename Flags, typename>
  3578. Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::store(U *mem, Mask mask, Flags) const
  3579. {
  3580. if (mask.data())
  3581. mem[0] = m_data;
  3582. }
  3583. template <typename T>
  3584. template <class MT, class IT, int Scale>
  3585. Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::gatherImplementation(
  3586. const Common::GatherArguments<MT, IT, Scale> &args)
  3587. {
  3588. m_data = args.address[Scale * args.indexes[0]];
  3589. }
  3590. template <typename T>
  3591. template <class MT, class IT, int Scale>
  3592. Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::gatherImplementation(
  3593. const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
  3594. {
  3595. if (mask.data()) {
  3596. m_data = args.address[Scale * args.indexes[0]];
  3597. }
  3598. }
  3599. template <typename T>
  3600. template <typename MT, typename IT>
  3601. Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::scatterImplementation(MT *mem,
  3602. IT &&indexes)
  3603. const
  3604. {
  3605. mem[indexes[0]] = m_data;
  3606. }
  3607. template <typename T>
  3608. template <typename MT, typename IT>
  3609. Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::scatterImplementation(
  3610. MT *mem, IT &&indexes, MaskArgument mask) const
  3611. {
  3612. if (mask.data()) {
  3613. mem[indexes[0]] = m_data;
  3614. }
  3615. }
  3616. Vc_INTRINSIC Vc_CONST Scalar::float_v exponent(Scalar::float_v x)
  3617. {
  3618. Vc_ASSERT(x.data() >= 0.f);
  3619. union { float f; int i; } value;
  3620. value.f = x.data();
  3621. return Scalar::float_v(static_cast<float>((value.i >> 23) - 0x7f));
  3622. }
  3623. Vc_INTRINSIC Vc_CONST Scalar::double_v Vc_VDECL exponent(Scalar::double_v x)
  3624. {
  3625. Vc_ASSERT(x.data() >= 0.);
  3626. union { double f; long long i; } value;
  3627. value.f = x.data();
  3628. return Scalar::double_v(static_cast<double>((value.i >> 52) - 0x3ff));
  3629. }
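// Editor's note -- illustrative, not part of the original Vc sources.  The
// exponent() overloads above read the IEEE-754 exponent field directly:
// shifting the bit pattern right by the mantissa width (23 for float, 52 for
// double) and subtracting the bias (0x7f resp. 0x3ff) yields floor(log2(x))
// for positive, finite, normal x.  Worked example: 8.0f has the bit pattern
// 0x41000000, and (0x41000000 >> 23) - 0x7f == 0x82 - 0x7f == 3.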
  3630. static Vc_ALWAYS_INLINE void _doRandomStep(Scalar::uint_v &state0, Scalar::uint_v &state1)
  3631. {
  3632. using Scalar::uint_v;
  3633. state0.load(&Common::RandomState[0]);
  3634. state1.load(&Common::RandomState[uint_v::Size]);
  3635. Detail::operator+(Detail::operator*(state1, uint_v(0xdeece66du)),
  3636. uint_v(11))
  3637. .store(&Common::RandomState[uint_v::Size]);
  3638. uint_v(Detail::operator+(Detail::operator*(state0, uint_v(0xdeece66du)), uint_v(11))
  3639. .data() ^
  3640. (state1.data() >> 16))
  3641. .store(&Common::RandomState[0]);
  3642. }
  3643. template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Scalar> Vector<T, VectorAbi::Scalar>::Random()
  3644. {
  3645. Scalar::uint_v state0, state1;
  3646. _doRandomStep(state0, state1);
  3647. return Vector<T, VectorAbi::Scalar>(static_cast<EntryType>(state0.data()));
  3648. }
  3649. template<> Vc_INTRINSIC Scalar::float_v Scalar::float_v::Random()
  3650. {
  3651. Scalar::uint_v state0, state1;
  3652. _doRandomStep(state0, state1);
  3653. union { unsigned int i; float f; } x;
  3654. x.i = (state0.data() & 0x0fffffffu) | 0x3f800000u;
  3655. return Scalar::float_v(x.f - 1.f);
  3656. }
  3657. template<> Vc_INTRINSIC Scalar::double_v Scalar::double_v::Random()
  3658. {
  3659. typedef unsigned long long uint64 Vc_MAY_ALIAS;
  3660. uint64 state0 = *reinterpret_cast<const uint64 *>(&Common::RandomState[8]);
  3661. state0 = (state0 * 0x5deece66dull + 11) & 0x000fffffffffffffull;
  3662. *reinterpret_cast<uint64 *>(&Common::RandomState[8]) = state0;
  3663. union { unsigned long long i; double f; } x;
  3664. x.i = state0 | 0x3ff0000000000000ull;
  3665. return Scalar::double_v(x.f - 1.);
  3666. }
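// Editor's note -- illustrative sketch, not part of the original Vc sources.
// Random() above advances a linear congruential state and converts it to a
// uniform value without a division: OR-ing the exponent pattern of 1.0
// (0x3f800000 for float, 0x3ff0000000000000 for double) onto random mantissa
// bits produces a number in [1, 2), and subtracting 1 maps it to [0, 1).
// A hedged standalone sketch of the same trick:
//
//   unsigned int bits = next_state() & 0x007fffffu;  // 23 random mantissa bits
//   union { unsigned int i; float f; } u;
//   u.i = bits | 0x3f800000u;                        // exponent field of 1.0f
//   float r = u.f - 1.f;                             // uniform in [0, 1)
//
// (`next_state` is a placeholder for the RandomState update shown above.)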
  3667. Vc_INTRINSIC Vc_CONST Scalar::float_m isnegative(Scalar::float_v x)
  3668. {
  3669. static_assert(sizeof(float) == sizeof(unsigned int),
  3670. "This code assumes float and unsigned int have the same number of "
  3671. "Bytes. Please file a bug report if this is a problem.");
  3672. union { float f; unsigned int i; } u;
  3673. u.f = x.data();
  3674. return Scalar::float_m(0u != (u.i & 0x80000000u));
  3675. }
  3676. Vc_INTRINSIC Vc_CONST Scalar::double_m Vc_VDECL isnegative(Scalar::double_v x)
  3677. {
  3678. static_assert(sizeof(double) == sizeof(unsigned long long),
  3679. "This code assumes double and unsigned long long have the same number "
  3680. "of Bytes. Please file a bug report if this is a problem.");
  3681. union { double d; unsigned long long l; } u;
  3682. u.d = x.data();
  3683. return Scalar::double_m(0ull != (u.l & 0x8000000000000000ull));
  3684. }
  3685. template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::setQnan()
  3686. {
  3687. union { float f; unsigned int i; } u;
  3688. u.i = 0xffffffffu;
  3689. m_data = u.f;
  3690. }
  3691. template<> Vc_INTRINSIC void Scalar::double_v::setQnan()
  3692. {
  3693. union { double d; unsigned long long l; } u;
  3694. u.l = 0xffffffffffffffffull;
  3695. m_data = u.d;
  3696. }
  3697. template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::setQnan(Mask m)
  3698. {
  3699. if (m.data()) {
  3700. setQnan();
  3701. }
  3702. }
  3703. template<> Vc_INTRINSIC void Scalar::double_v::setQnan(Scalar::double_v::Mask m)
  3704. {
  3705. if (m.data()) {
  3706. setQnan();
  3707. }
  3708. }
  3709. namespace Common
  3710. {
  3711. Vc_ALWAYS_INLINE void transpose_impl(TransposeTag<1, 1>, Scalar::float_v *Vc_RESTRICT r[],
  3712. const TransposeProxy<Scalar::float_v> &proxy)
  3713. {
  3714. *r[0] = std::get<0>(proxy.in).data();
  3715. }
  3716. }
  3717. }
  3718. #ifndef VC_SCALAR_SIMD_CAST_H_
  3719. #define VC_SCALAR_SIMD_CAST_H_
  3720. #ifndef VC_COMMON_SIMD_CAST_H_
  3721. #define VC_COMMON_SIMD_CAST_H_
  3722. #include <type_traits>
  3723. template <class> void simd_cast();
  3724. namespace Vc_VERSIONED_NAMESPACE
  3725. {
  3726. template <typename To, typename From>
  3727. Vc_INTRINSIC Vc_CONST To
  3728. simd_cast(From &&x, enable_if<std::is_same<To, Traits::decay<From>>::value> = nullarg)
  3729. {
  3730. return std::forward<From>(x);
  3731. }
  3732. template <typename To> Vc_INTRINSIC Vc_CONST To simd_cast() { return To(); }
  3733. }
  3734. #endif
  3735. #ifndef VC_SCALAR_TYPE_TRAITS_H_
  3736. #define VC_SCALAR_TYPE_TRAITS_H_
  3737. namespace Vc_VERSIONED_NAMESPACE
  3738. {
  3739. namespace Scalar
  3740. {
  3741. namespace Traits
  3742. {
  3743. template <typename T> struct is_vector : public std::false_type {};
  3744. template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
  3745. template <typename T> struct is_mask : public std::false_type {};
  3746. template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
  3747. }
  3748. }
  3749. }
  3750. #endif
  3751. namespace Vc_VERSIONED_NAMESPACE
  3752. {
  3753. template <typename To, typename From>
  3754. Vc_INTRINSIC Vc_CONST To
  3755. simd_cast(Scalar::Vector<From> x, enable_if<Scalar::is_vector<To>::value> = nullarg)
  3756. {
  3757. return static_cast<To>(x.data());
  3758. }
  3759. template <typename To, typename From>
  3760. Vc_INTRINSIC Vc_CONST To
  3761. simd_cast(Scalar::Mask<From> x, enable_if<Scalar::is_mask<To>::value> = nullarg)
  3762. {
  3763. return static_cast<To>(x.data());
  3764. }
  3765. template <typename Return, int offset, typename T>
  3766. Vc_INTRINSIC Vc_CONST Return simd_cast(
  3767. T &&x,
  3768. enable_if<Traits::is_simd_vector<T>::value && Scalar::is_vector<Return>::value> = nullarg)
  3769. {
  3770. return Return(x[offset]);
  3771. }
  3772. template <typename Return, int offset, typename T>
  3773. Vc_INTRINSIC Vc_CONST enable_if<offset == 0 && Traits::is_simd_vector<Return>::value &&
  3774. !Scalar::is_vector<Return>::value,
  3775. Return>
  3776. simd_cast(Scalar::Vector<T> x)
  3777. {
  3778. Return r{};
  3779. r[0] = static_cast<typename Return::EntryType>(x.data());
  3780. return r;
  3781. }
  3782. template <typename Return, int offset, typename T>
  3783. Vc_INTRINSIC Vc_CONST Return simd_cast(
  3784. T &&x,
  3785. enable_if<Traits::is_simd_mask<T>::value && Scalar::is_mask<Return>::value> = nullarg)
  3786. {
  3787. return Return(bool(x[offset]));
  3788. }
  3789. template <typename Return, int offset, typename T>
  3790. Vc_INTRINSIC Vc_CONST enable_if<
  3791. offset == 0 && Traits::is_simd_mask<Return>::value && !Scalar::is_mask<Return>::value,
  3792. Return>
  3793. simd_cast(Scalar::Mask<T> x)
  3794. {
  3795. Return r(false);
  3796. r[0] = x[0];
  3797. return r;
  3798. }
  3799. }
  3800. #endif
  3801. #endif
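// Editor's note -- illustrative sketch, not part of the original Vc sources.
// The scalar simd_cast overloads above convert element-wise between vector
// and mask types; the <offset> variants pick which source lane feeds the
// single scalar lane.  A hedged example using the scalar-ABI types:
//
//   Vc::Scalar::Vector<float> f(3.7f);
//   auto i = Vc::simd_cast<Vc::Scalar::Vector<int>>(f);    // truncates, i.data() == 3
//   Vc::Scalar::Mask<float> m(true);
//   auto d = Vc::simd_cast<Vc::Scalar::Mask<double>>(m);   // d.data() == true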
  3802. #if defined(Vc_IMPL_SSE)
  3803. #ifndef VC_SSE_VECTOR_H_
  3804. #define VC_SSE_VECTOR_H_
  3805. #ifndef VC_SSE_INTRINSICS_H_
  3806. #define VC_SSE_INTRINSICS_H_
  3807. #ifdef Vc_MSVC
  3808. #include <intrin.h>
  3809. #else
  3810. #include <x86intrin.h>
  3811. #endif
  3812. #ifndef VC_COMMON_STORAGE_H_
  3813. #define VC_COMMON_STORAGE_H_
  3814. #ifndef VC_COMMON_ALIASINGENTRYHELPER_H_
  3815. #define VC_COMMON_ALIASINGENTRYHELPER_H_
  3816. namespace Vc_VERSIONED_NAMESPACE
  3817. {
  3818. namespace Common
  3819. {
  3820. template<class StorageType> class AliasingEntryHelper
  3821. {
  3822. private:
  3823. typedef typename StorageType::EntryType T;
  3824. #ifdef Vc_ICC
  3825. StorageType *const m_storage;
  3826. const int m_index;
  3827. public:
  3828. Vc_ALWAYS_INLINE AliasingEntryHelper(StorageType *d, int index) : m_storage(d), m_index(index) {}
  3829. Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &) = default;
  3830. Vc_ALWAYS_INLINE AliasingEntryHelper(AliasingEntryHelper &&) = default;
  3831. Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) {
  3832. m_storage->assign(m_index, rhs);
  3833. return *this;
  3834. }
  3835. Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_storage->assign(m_index, x); return *this; }
  3836. Vc_ALWAYS_INLINE AliasingEntryHelper &operator +=(T x) { m_storage->assign(m_index, m_storage->m(m_index) + x); return *this; }
  3837. Vc_ALWAYS_INLINE AliasingEntryHelper &operator -=(T x) { m_storage->assign(m_index, m_storage->m(m_index) - x); return *this; }
  3838. Vc_ALWAYS_INLINE AliasingEntryHelper &operator /=(T x) { m_storage->assign(m_index, m_storage->m(m_index) / x); return *this; }
  3839. Vc_ALWAYS_INLINE AliasingEntryHelper &operator *=(T x) { m_storage->assign(m_index, m_storage->m(m_index) * x); return *this; }
  3840. Vc_ALWAYS_INLINE AliasingEntryHelper &operator |=(T x) { m_storage->assign(m_index, m_storage->m(m_index) | x); return *this; }
  3841. Vc_ALWAYS_INLINE AliasingEntryHelper &operator &=(T x) { m_storage->assign(m_index, m_storage->m(m_index) & x); return *this; }
  3842. Vc_ALWAYS_INLINE AliasingEntryHelper &operator ^=(T x) { m_storage->assign(m_index, m_storage->m(m_index) ^ x); return *this; }
  3843. Vc_ALWAYS_INLINE AliasingEntryHelper &operator %=(T x) { m_storage->assign(m_index, m_storage->m(m_index) % x); return *this; }
  3844. Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_storage->assign(m_index, m_storage->m(m_index)<< x); return *this; }
  3845. Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_storage->assign(m_index, m_storage->m(m_index)>> x); return *this; }
  3846. #define m_data m_storage->read(m_index)
  3847. #else
  3848. typedef T A Vc_MAY_ALIAS;
  3849. A &m_data;
  3850. public:
  3851. template<typename T2>
  3852. Vc_ALWAYS_INLINE AliasingEntryHelper(T2 &d) : m_data(reinterpret_cast<A &>(d)) {}
  3853. Vc_ALWAYS_INLINE AliasingEntryHelper(A &d) : m_data(d) {}
  3854. Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) {
  3855. m_data = rhs.m_data;
  3856. return *this;
  3857. }
  3858. Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_data = x; return *this; }
  3859. Vc_ALWAYS_INLINE AliasingEntryHelper &operator+=(T x) { m_data += x; return *this; }
  3860. Vc_ALWAYS_INLINE AliasingEntryHelper &operator-=(T x) { m_data -= x; return *this; }
  3861. Vc_ALWAYS_INLINE AliasingEntryHelper &operator/=(T x) { m_data /= x; return *this; }
  3862. Vc_ALWAYS_INLINE AliasingEntryHelper &operator*=(T x) { m_data *= x; return *this; }
  3863. Vc_ALWAYS_INLINE AliasingEntryHelper &operator|=(T x) { m_data |= x; return *this; }
  3864. Vc_ALWAYS_INLINE AliasingEntryHelper &operator&=(T x) { m_data &= x; return *this; }
  3865. Vc_ALWAYS_INLINE AliasingEntryHelper &operator^=(T x) { m_data ^= x; return *this; }
  3866. Vc_ALWAYS_INLINE AliasingEntryHelper &operator%=(T x) { m_data %= x; return *this; }
  3867. Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_data <<= x; return *this; }
  3868. Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_data >>= x; return *this; }
  3869. #endif
  3870. Vc_ALWAYS_INLINE Vc_PURE operator const T() const { return m_data; }
  3871. Vc_ALWAYS_INLINE Vc_PURE bool operator==(T x) const { return static_cast<T>(m_data) == x; }
  3872. Vc_ALWAYS_INLINE Vc_PURE bool operator!=(T x) const { return static_cast<T>(m_data) != x; }
  3873. Vc_ALWAYS_INLINE Vc_PURE bool operator<=(T x) const { return static_cast<T>(m_data) <= x; }
  3874. Vc_ALWAYS_INLINE Vc_PURE bool operator>=(T x) const { return static_cast<T>(m_data) >= x; }
  3875. Vc_ALWAYS_INLINE Vc_PURE bool operator< (T x) const { return static_cast<T>(m_data) < x; }
  3876. Vc_ALWAYS_INLINE Vc_PURE bool operator> (T x) const { return static_cast<T>(m_data) > x; }
  3877. Vc_ALWAYS_INLINE Vc_PURE T operator-() const { return -static_cast<T>(m_data); }
  3878. Vc_ALWAYS_INLINE Vc_PURE T operator~() const { return ~static_cast<T>(m_data); }
  3879. Vc_ALWAYS_INLINE Vc_PURE T operator+(T x) const { return static_cast<T>(m_data) + x; }
  3880. Vc_ALWAYS_INLINE Vc_PURE T operator-(T x) const { return static_cast<T>(m_data) - x; }
  3881. Vc_ALWAYS_INLINE Vc_PURE T operator/(T x) const { return static_cast<T>(m_data) / x; }
  3882. Vc_ALWAYS_INLINE Vc_PURE T operator*(T x) const { return static_cast<T>(m_data) * x; }
  3883. Vc_ALWAYS_INLINE Vc_PURE T operator|(T x) const { return static_cast<T>(m_data) | x; }
  3884. Vc_ALWAYS_INLINE Vc_PURE T operator&(T x) const { return static_cast<T>(m_data) & x; }
  3885. Vc_ALWAYS_INLINE Vc_PURE T operator^(T x) const { return static_cast<T>(m_data) ^ x; }
  3886. Vc_ALWAYS_INLINE Vc_PURE T operator%(T x) const { return static_cast<T>(m_data) % x; }
  3887. #ifdef m_data
  3888. #undef m_data
  3889. #endif
  3890. };
  3891. }
  3892. }
  3893. #endif
  3894. #ifndef VC_COMMON_MASKENTRY_H_
  3895. #define VC_COMMON_MASKENTRY_H_
  3896. namespace Vc_VERSIONED_NAMESPACE
  3897. {
  3898. namespace Common
  3899. {
  3900. namespace
  3901. {
  3902. template<size_t Bytes> struct MaskBoolStorage;
  3903. template<> struct MaskBoolStorage<1> { typedef std::int8_t type; };
  3904. template<> struct MaskBoolStorage<2> { typedef std::int16_t type; };
  3905. template<> struct MaskBoolStorage<4> { typedef std::int32_t type; };
  3906. template<> struct MaskBoolStorage<8> { typedef std::int64_t type; };
  3907. }
  3908. template<size_t Bytes> class MaskBool
  3909. {
  3910. typedef typename MaskBoolStorage<Bytes>::type storage_type Vc_MAY_ALIAS;
  3911. storage_type data;
  3912. public:
  3913. constexpr MaskBool(bool x) noexcept : data(x ? -1 : 0) {}
  3914. Vc_ALWAYS_INLINE MaskBool &operator=(bool x) noexcept { data = x ? -1 : 0; return *this; }
  3915. template <typename T, typename = enable_if<(!std::is_same<T, bool>::value &&
  3916. std::is_fundamental<T>::value)>>
  3917. Vc_ALWAYS_INLINE MaskBool &operator=(T x) noexcept
  3918. {
  3919. data = reinterpret_cast<const storage_type &>(x);
  3920. return *this;
  3921. }
  3922. Vc_ALWAYS_INLINE MaskBool(const MaskBool &) noexcept = default;
  3923. Vc_ALWAYS_INLINE MaskBool &operator=(const MaskBool &) noexcept = default;
  3924. template <typename T, typename = enable_if<(std::is_same<T, bool>::value ||
  3925. (std::is_fundamental<T>::value &&
  3926. sizeof(storage_type) == sizeof(T)))>>
  3927. constexpr operator T() const noexcept
  3928. {
  3929. return std::is_same<T, bool>::value ? T((data & 1) != 0) : aliasing_cast<T>(data);
  3930. }
  3931. } Vc_MAY_ALIAS;
  3932. template <typename A,
  3933. typename B,
  3934. typename std::enable_if<
  3935. std::is_convertible<A, bool>::value &&std::is_convertible<B, bool>::value,
  3936. int>::type = 0>
  3937. constexpr bool operator==(A &&a, B &&b)
  3938. {
  3939. return static_cast<bool>(a) == static_cast<bool>(b);
  3940. }
  3941. template <typename A,
  3942. typename B,
  3943. typename std::enable_if<
  3944. std::is_convertible<A, bool>::value &&std::is_convertible<B, bool>::value,
  3945. int>::type = 0>
  3946. constexpr bool operator!=(A &&a, B &&b)
  3947. {
  3948. return static_cast<bool>(a) != static_cast<bool>(b);
  3949. }
  3950. }
  3951. }
  3952. #endif
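// Editor's note -- illustrative, not part of the original Vc sources.
// MaskBool stores a boolean as an all-ones (-1) or all-zero integer of the
// requested width, matching the representation SIMD compare instructions
// produce, so a mask entry can be handed back either as a bool (low bit) or
// as the raw lane pattern.  A hedged sketch:
//
//   Vc::Common::MaskBool<4> m(true);   // stored as 0xffffffff
//   bool b = m;                        // true  (tests the low bit)
//   int  i = m;                        // -1    (full lane pattern)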
  3953. #ifdef Vc_IMPL_AVX
  3954. #ifndef VC_AVX_INTRINSICS_H_
  3955. #define VC_AVX_INTRINSICS_H_
  3956. extern "C" {
  3957. #include <immintrin.h>
  3958. #if (defined(Vc_IMPL_XOP) || defined(Vc_IMPL_FMA4)) && !defined(Vc_MSVC)
  3959. #include <x86intrin.h>
  3960. #endif
  3961. }
  3962. #ifndef VC_COMMON_FIX_CLANG_EMMINTRIN_H_
  3963. #define VC_COMMON_FIX_CLANG_EMMINTRIN_H_
  3964. #if (defined Vc_CLANG && Vc_CLANG < 0x30700) || (defined Vc_APPLECLANG && Vc_APPLECLANG < 0x70000)
  3965. #ifdef _mm_slli_si128
  3966. #undef _mm_slli_si128
  3967. #define _mm_slli_si128(a,count) __extension__ ({ \
  3968. (__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (count)*8); })
  3969. #endif
  3970. #ifdef _mm_srli_si128
  3971. #undef _mm_srli_si128
  3972. #define _mm_srli_si128(a,count) __extension__ ({ \
  3973. (__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (count)*8); })
  3974. #endif
  3975. #ifdef _mm_shuffle_epi32
  3976. #undef _mm_shuffle_epi32
  3977. #define _mm_shuffle_epi32(a,imm) __extension__ ({ \
  3978. (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), (__v4si) _mm_set1_epi32(0), \
  3979. (imm) & 0x3, ((imm) & 0xc) >> 2, \
  3980. ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
  3981. #endif
  3982. #ifdef _mm_shufflelo_epi16
  3983. #undef _mm_shufflelo_epi16
  3984. #define _mm_shufflelo_epi16(a,imm) __extension__ ({ \
  3985. (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \
  3986. (imm) & 0x3, ((imm) & 0xc) >> 2, \
  3987. ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
  3988. 4, 5, 6, 7); })
  3989. #endif
  3990. #ifdef _mm_shufflehi_epi16
  3991. #undef _mm_shufflehi_epi16
  3992. #define _mm_shufflehi_epi16(a,imm) __extension__ ({ \
  3993. (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \
  3994. 0, 1, 2, 3, \
  3995. 4 + (((imm) & 0x03) >> 0), \
  3996. 4 + (((imm) & 0x0c) >> 2), \
  3997. 4 + (((imm) & 0x30) >> 4), \
  3998. 4 + (((imm) & 0xc0) >> 6)); })
  3999. #endif
  4000. #ifdef _mm_shuffle_pd
  4001. #undef _mm_shuffle_pd
  4002. #define _mm_shuffle_pd(a,b,i) __extension__ ({ \
  4003. __builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, (((i) & 2) >> 1) + 2); })
  4004. #endif
  4005. #endif
  4006. #endif
  4007. #ifndef VC_AVX_CONST_DATA_H_
  4008. #define VC_AVX_CONST_DATA_H_
  4009. namespace Vc_VERSIONED_NAMESPACE
  4010. {
  4011. namespace AVX
  4012. {
  4013. alignas(64) extern const unsigned int _IndexesFromZero32[ 8];
  4014. alignas(16) extern const unsigned short _IndexesFromZero16[16];
  4015. alignas(16) extern const unsigned char _IndexesFromZero8 [32];
  4016. struct alignas(64) c_general
  4017. {
  4018. static const float oneFloat;
  4019. static const unsigned int absMaskFloat[2];
  4020. static const unsigned int signMaskFloat[2];
  4021. static const unsigned int highMaskFloat;
  4022. static const unsigned short minShort[2];
  4023. static const unsigned short one16[2];
  4024. static const float _2power31;
  4025. static const double oneDouble;
  4026. static const unsigned long long frexpMask;
  4027. static const unsigned long long highMaskDouble;
  4028. };
  4029. template<typename T> struct c_trig
  4030. {
  4031. alignas(64) static const T data[];
  4032. };
  4033. #ifndef Vc_MSVC
  4034. template <> alignas(64) const float c_trig<float>::data[];
  4035. template <> alignas(64) const double c_trig<double>::data[];
  4036. #endif
  4037. template<typename T> struct c_log
  4038. {
  4039. typedef float floatAlias Vc_MAY_ALIAS;
  4040. static Vc_ALWAYS_INLINE float d(int i) { return *reinterpret_cast<const floatAlias *>(&data[i]); }
  4041. alignas(64) static const unsigned int data[21];
  4042. };
  4043. #ifndef Vc_MSVC
  4044. template<> alignas(64) const unsigned int c_log<float>::data[21];
  4045. #endif
  4046. template<> struct c_log<double>
  4047. {
  4048. enum VectorSize { Size = 16 / sizeof(double) };
  4049. typedef double doubleAlias Vc_MAY_ALIAS;
  4050. static Vc_ALWAYS_INLINE double d(int i) { return *reinterpret_cast<const doubleAlias *>(&data[i]); }
  4051. alignas(64) static const unsigned long long data[21];
  4052. };
  4053. }
  4054. }
  4055. namespace Vc_VERSIONED_NAMESPACE
  4056. {
  4057. namespace AVX2
  4058. {
  4059. using AVX::_IndexesFromZero8;
  4060. using AVX::_IndexesFromZero16;
  4061. using AVX::_IndexesFromZero32;
  4062. using AVX::c_general;
  4063. using AVX::c_trig;
  4064. using AVX::c_log;
  4065. }
  4066. }
  4067. #endif
  4068. #include <cstdlib>
  4069. #if (defined Vc_CLANG && Vc_CLANG >= 0x30900 && Vc_CLANG < 0x70000)
  4070. #ifdef _mm256_permute2f128_si256
  4071. #undef _mm256_permute2f128_si256
  4072. #define _mm256_permute2f128_si256(V1,V2,M) __extension__ ({ \
  4073. (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
  4074. (__v8si)(__m256i)(V2), (char)(M)); })
  4075. #endif
  4076. #ifdef _mm256_permute2f128_ps
  4077. #undef _mm256_permute2f128_ps
  4078. #define _mm256_permute2f128_ps(V1,V2,M) __extension__ ({ \
  4079. (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
  4080. (__v8sf)(__m256)(V2), (char)(M)); })
  4081. #endif
  4082. #ifdef _mm256_permute2x128_si256
  4083. #undef _mm256_permute2x128_si256
  4084. #define _mm256_permute2x128_si256(V1,V2,M) __extension__ ({ \
  4085. (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (char)(M)); })
  4086. #endif
  4087. #endif
  4088. namespace Vc_VERSIONED_NAMESPACE
  4089. {
  4090. namespace AvxIntrinsics
  4091. {
  4092. using AVX::c_general;
  4093. using AVX::_IndexesFromZero32;
  4094. using AVX::_IndexesFromZero16;
  4095. using AVX::_IndexesFromZero8;
  4096. typedef __m128 m128 ;
  4097. typedef __m128d m128d;
  4098. typedef __m128i m128i;
  4099. typedef __m256 m256 ;
  4100. typedef __m256d m256d;
  4101. typedef __m256i m256i;
  4102. #ifdef Vc_GCC
  4103. static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) * static_cast<__v4df>(b)); }
  4104. static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) + static_cast<__v4df>(b)); }
  4105. static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) - static_cast<__v4df>(b)); }
  4106. static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) * static_cast<__v8sf>(b)); }
  4107. static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) + static_cast<__v8sf>(b)); }
  4108. static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); }
  4109. #endif
  4110. static Vc_INTRINSIC m256d Vc_CONST set1_pd (double a) { return _mm256_set1_pd (a); }
  4111. static Vc_INTRINSIC m256i Vc_CONST set1_epi32(int a) { return _mm256_set1_epi32(a); }
  4112. static Vc_INTRINSIC Vc_CONST m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
  4113. static Vc_INTRINSIC Vc_CONST m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
  4114. static Vc_INTRINSIC Vc_CONST m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
  4115. static Vc_INTRINSIC Vc_CONST m256i setallone_si256() { return _mm256_castps_si256(_mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet))); }
  4116. static Vc_INTRINSIC Vc_CONST m256d setallone_pd() { return _mm256_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
  4117. static Vc_INTRINSIC Vc_CONST m256 setallone_ps() { return _mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
  4118. static Vc_INTRINSIC m256i Vc_CONST setone_epi8 () { return _mm256_set1_epi8(1); }
  4119. static Vc_INTRINSIC m256i Vc_CONST setone_epu8 () { return setone_epi8(); }
  4120. static Vc_INTRINSIC m256i Vc_CONST setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::one16))); }
  4121. static Vc_INTRINSIC m256i Vc_CONST setone_epu16() { return setone_epi16(); }
  4122. static Vc_INTRINSIC m256i Vc_CONST setone_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&_IndexesFromZero32[1]))); }
  4123. static Vc_INTRINSIC m256i Vc_CONST setone_epu32() { return setone_epi32(); }
  4124. static Vc_INTRINSIC m256 Vc_CONST setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); }
  4125. static Vc_INTRINSIC m256d Vc_CONST setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); }
  4126. static Vc_INTRINSIC m256d Vc_CONST setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::absMaskFloat[0])); }
  4127. static Vc_INTRINSIC m256 Vc_CONST setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::absMaskFloat[1])); }
  4128. static Vc_INTRINSIC m256d Vc_CONST setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::signMaskFloat[0])); }
  4129. static Vc_INTRINSIC m256 Vc_CONST setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1])); }
  4130. static Vc_INTRINSIC m256 Vc_CONST set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); }
  4131. static Vc_INTRINSIC m128 Vc_CONST _mm_set2power31_ps() { return _mm_broadcast_ss(&c_general::_2power31); }
  4132. static Vc_INTRINSIC m256i Vc_CONST set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
  4133. static Vc_INTRINSIC m128i Vc_CONST _mm_set2power31_epu32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
  4134. static Vc_INTRINSIC m256i Vc_CONST setmin_epi8 () { return _mm256_set1_epi8(-0x80); }
  4135. static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
  4136. static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
  4137. static Vc_INTRINSIC m256i Vc_CONST setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
  4138. static Vc_INTRINSIC m256i Vc_CONST setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
  4139. template <int i>
  4140. static Vc_INTRINSIC Vc_CONST unsigned int extract_epu32(__m128i x)
  4141. {
  4142. return _mm_extract_epi32(x, i);
  4143. }
  4144. template <int offset> Vc_INTRINSIC __m256 insert128(__m256 a, __m128 b) { return _mm256_insertf128_ps(a, b, offset); }
  4145. template <int offset> Vc_INTRINSIC __m256d insert128(__m256d a, __m128d b) { return _mm256_insertf128_pd(a, b, offset); }
  4146. template <int offset> Vc_INTRINSIC __m256i insert128(__m256i a, __m128i b) {
  4147. #ifdef Vc_IMPL_AVX2
  4148. return _mm256_inserti128_si256(a, b, offset);
  4149. #else
  4150. return _mm256_insertf128_si256(a, b, offset);
  4151. #endif
  4152. }
  4153. template <int offset> Vc_INTRINSIC __m128 extract128(__m256 a) { return _mm256_extractf128_ps(a, offset); }
  4154. template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); }
  4155. template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a) {
  4156. #ifdef Vc_IMPL_AVX2
  4157. return _mm256_extracti128_si256(a, offset);
  4158. #else
  4159. return _mm256_extractf128_si256(a, offset);
  4160. #endif
  4161. }
  4162. #ifdef Vc_GCC
  4163. Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a == b); }
  4164. Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a != b); }
  4165. Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a < b); }
  4166. Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a >= b); }
  4167. Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a <= b); }
  4168. Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a > b); }
  4169. Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a == b); }
  4170. Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a != b); }
  4171. Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a < b); }
  4172. Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a >= b); }
  4173. Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a <= b); }
  4174. Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a > b); }
  4175. #else
  4176. Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
  4177. Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
  4178. Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
  4179. Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
  4180. Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
  4181. Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
  4182. Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
  4183. Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
  4184. Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
  4185. Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
  4186. Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
  4187. Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
  4188. #endif
  4189. Vc_INTRINSIC __m256d cmpnlt_pd (__m256d a, __m256d b) { return cmpge_pd(a, b); }
  4190. Vc_INTRINSIC __m256d cmpnle_pd (__m256d a, __m256d b) { return cmpgt_pd(a, b); }
  4191. Vc_INTRINSIC __m256 cmpnlt_ps (__m256 a, __m256 b) { return cmpge_ps(a, b); }
  4192. Vc_INTRINSIC __m256 cmpnle_ps (__m256 a, __m256 b) { return cmpgt_ps(a, b); }
  4193. Vc_INTRINSIC __m256d cmpord_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); }
  4194. Vc_INTRINSIC __m256d cmpunord_pd(__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); }
  4195. Vc_INTRINSIC __m256 cmpord_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); }
  4196. Vc_INTRINSIC __m256 cmpunord_ps(__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); }
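// Unsigned 16-bit compares: XOP provides them directly (_mm_comlt_epu16 /
// _mm_comgt_epu16); otherwise both operands are biased by 0x8000 (xor with
// setmin_epi16) so that the signed 16-bit compare reproduces the unsigned
// ordering.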
  4197. #if defined(Vc_IMPL_XOP)
  4198. static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
  4199. return _mm_comlt_epu16(a, b);
  4200. }
  4201. static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
  4202. return _mm_comgt_epu16(a, b);
  4203. }
  4204. #else
  4205. static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
  4206. return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
  4207. }
  4208. static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
  4209. return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
  4210. }
  4211. #endif
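// alignr: AVX2 has a native 256-bit _mm256_alignr_epi8, which operates per
// 128-bit lane; without AVX2 the same per-lane behaviour is emulated by
// applying _mm_alignr_epi8 to the low and high halves separately and
// recombining them with insert128<1>.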
  4212. #ifdef Vc_IMPL_AVX2
  4213. template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
  4214. {
  4215. return _mm256_alignr_epi8(s1, s2, shift);
  4216. }
  4217. #else
  4218. template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
  4219. {
  4220. return insert128<1>(
  4221. _mm256_castsi128_si256(_mm_alignr_epi8(_mm256_castsi256_si128(s1),
  4222. _mm256_castsi256_si128(s2), shift)),
  4223. _mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift));
  4224. }
  4225. #endif
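// The Vc_AVX_TO_SSE_* macros generate the 256-bit integer helpers. With AVX2
// they forward directly to the matching _mm256_* intrinsic. Without AVX2
// (AVX1 has no 256-bit integer ALU) each helper splits its arguments into two
// 128-bit halves, applies the corresponding _mm_* intrinsic per half and
// rejoins the results with insert128<1>(). For orientation, the non-AVX2
// expansion of Vc_AVX_TO_SSE_2_NEW(add_epi32) is essentially:
//
//   Vc_INTRINSIC Vc_CONST m256i add_epi32(__m256i a0, __m256i b0)
//   {
//       m128i a1 = extract128<1>(a0);                 // high halves
//       m128i b1 = extract128<1>(b0);
//       m128i r0 = _mm_add_epi32(_mm256_castsi256_si128(a0),
//                                _mm256_castsi256_si128(b0));  // low halves
//       m128i r1 = _mm_add_epi32(a1, b1);
//       return insert128<1>(_mm256_castsi128_si256(r0), r1);
//   }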
  4226. #ifdef Vc_IMPL_AVX2
  4227. #define Vc_AVX_TO_SSE_2_NEW(name) \
  4228. Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \
  4229. { \
  4230. return _mm256_##name(a0, b0); \
  4231. }
  4232. #define Vc_AVX_TO_SSE_256_128(name) \
  4233. Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
  4234. { \
  4235. return _mm256_##name(a0, b0); \
  4236. }
  4237. #define Vc_AVX_TO_SSE_1i(name) \
  4238. template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
  4239. { \
  4240. return _mm256_##name(a0, i); \
  4241. }
  4242. #define Vc_AVX_TO_SSE_1(name) \
  4243. Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) { return _mm256_##name(a0); }
  4244. #define Vc_AVX_TO_SSE_1_128(name,shift__) \
  4245. Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) { return _mm256_##name(a0); }
  4246. #else
  4247. #define Vc_AVX_TO_SSE_1(name) \
  4248. Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) \
  4249. { \
  4250. __m128i a1 = extract128<1>(a0); \
  4251. __m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \
  4252. __m128i r1 = _mm_##name(a1); \
  4253. return insert128<1>(_mm256_castsi128_si256(r0), r1); \
  4254. }
  4255. #define Vc_AVX_TO_SSE_1_128(name,shift__) \
  4256. Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) \
  4257. { \
  4258. __m128i r0 = _mm_##name(a0); \
  4259. __m128i r1 = _mm_##name(_mm_srli_si128(a0, shift__)); \
  4260. return insert128<1>(_mm256_castsi128_si256(r0), r1); \
  4261. }
  4262. #define Vc_AVX_TO_SSE_2_NEW(name) \
  4263. Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \
  4264. { \
  4265. m128i a1 = extract128<1>(a0); \
  4266. m128i b1 = extract128<1>(b0); \
  4267. m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \
  4268. m128i r1 = _mm_##name(a1, b1); \
  4269. return insert128<1>(_mm256_castsi128_si256(r0), r1); \
  4270. }
  4271. #define Vc_AVX_TO_SSE_256_128(name) \
  4272. Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
  4273. { \
  4274. m128i a1 = extract128<1>(a0); \
  4275. m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0); \
  4276. m128i r1 = _mm_##name(a1, b0); \
  4277. return insert128<1>(_mm256_castsi128_si256(r0), r1); \
  4278. }
  4279. #define Vc_AVX_TO_SSE_1i(name) \
  4280. template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
  4281. { \
  4282. m128i a1 = extract128<1>(a0); \
  4283. m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \
  4284. m128i r1 = _mm_##name(a1, i); \
  4285. return insert128<1>(_mm256_castsi128_si256(r0), r1); \
  4286. }
  4287. #endif
  4288. Vc_INTRINSIC Vc_CONST __m128i sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); }
  4289. Vc_INTRINSIC Vc_CONST __m128i sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); }
  4290. Vc_INTRINSIC Vc_CONST __m128i sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); }
  4291. Vc_INTRINSIC Vc_CONST __m128i srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); }
  4292. Vc_INTRINSIC Vc_CONST __m128i srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); }
  4293. Vc_INTRINSIC Vc_CONST __m128i srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); }
  4294. Vc_INTRINSIC Vc_CONST __m128i sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); }
  4295. Vc_INTRINSIC Vc_CONST __m128i sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); }
  4296. Vc_AVX_TO_SSE_1i(slli_epi16)
  4297. Vc_AVX_TO_SSE_1i(slli_epi32)
  4298. Vc_AVX_TO_SSE_1i(slli_epi64)
  4299. Vc_AVX_TO_SSE_1i(srai_epi16)
  4300. Vc_AVX_TO_SSE_1i(srai_epi32)
  4301. Vc_AVX_TO_SSE_1i(srli_epi16)
  4302. Vc_AVX_TO_SSE_1i(srli_epi32)
  4303. Vc_AVX_TO_SSE_1i(srli_epi64)
  4304. Vc_AVX_TO_SSE_256_128(sll_epi16)
  4305. Vc_AVX_TO_SSE_256_128(sll_epi32)
  4306. Vc_AVX_TO_SSE_256_128(sll_epi64)
  4307. Vc_AVX_TO_SSE_256_128(srl_epi16)
  4308. Vc_AVX_TO_SSE_256_128(srl_epi32)
  4309. Vc_AVX_TO_SSE_256_128(srl_epi64)
  4310. Vc_AVX_TO_SSE_256_128(sra_epi16)
  4311. Vc_AVX_TO_SSE_256_128(sra_epi32)
  4312. Vc_AVX_TO_SSE_2_NEW(cmpeq_epi8)
  4313. Vc_AVX_TO_SSE_2_NEW(cmpeq_epi16)
  4314. Vc_AVX_TO_SSE_2_NEW(cmpeq_epi32)
  4315. Vc_AVX_TO_SSE_2_NEW(cmpeq_epi64)
  4316. Vc_AVX_TO_SSE_2_NEW(cmpgt_epi8)
  4317. Vc_AVX_TO_SSE_2_NEW(cmpgt_epi16)
  4318. Vc_AVX_TO_SSE_2_NEW(cmpgt_epi32)
  4319. Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64)
  4320. Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16)
  4321. Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16)
  4322. Vc_AVX_TO_SSE_2_NEW(add_epi16)
  4323. Vc_AVX_TO_SSE_2_NEW(add_epi32)
  4324. Vc_AVX_TO_SSE_2_NEW(add_epi64)
  4325. Vc_AVX_TO_SSE_2_NEW(sub_epi16)
  4326. Vc_AVX_TO_SSE_2_NEW(sub_epi32)
  4327. Vc_AVX_TO_SSE_2_NEW(mullo_epi16)
  4328. Vc_AVX_TO_SSE_2_NEW(sign_epi16)
  4329. Vc_AVX_TO_SSE_2_NEW(sign_epi32)
  4330. Vc_AVX_TO_SSE_2_NEW(min_epi8)
  4331. Vc_AVX_TO_SSE_2_NEW(max_epi8)
  4332. Vc_AVX_TO_SSE_2_NEW(min_epu16)
  4333. Vc_AVX_TO_SSE_2_NEW(max_epu16)
  4334. Vc_AVX_TO_SSE_2_NEW(min_epi32)
  4335. Vc_AVX_TO_SSE_2_NEW(max_epi32)
  4336. Vc_AVX_TO_SSE_2_NEW(min_epu32)
  4337. Vc_AVX_TO_SSE_2_NEW(max_epu32)
  4338. Vc_AVX_TO_SSE_2_NEW(mullo_epi32)
  4339. Vc_AVX_TO_SSE_1(abs_epi8)
  4340. Vc_AVX_TO_SSE_1(abs_epi16)
  4341. Vc_AVX_TO_SSE_1(abs_epi32)
  4342. Vc_AVX_TO_SSE_1_128(cvtepi8_epi16, 8)
  4343. Vc_AVX_TO_SSE_1_128(cvtepi8_epi32, 4)
  4344. Vc_AVX_TO_SSE_1_128(cvtepi8_epi64, 2)
  4345. Vc_AVX_TO_SSE_1_128(cvtepi16_epi32, 8)
  4346. Vc_AVX_TO_SSE_1_128(cvtepi16_epi64, 4)
  4347. Vc_AVX_TO_SSE_1_128(cvtepi32_epi64, 8)
  4348. Vc_AVX_TO_SSE_1_128(cvtepu8_epi16, 8)
  4349. Vc_AVX_TO_SSE_1_128(cvtepu8_epi32, 4)
  4350. Vc_AVX_TO_SSE_1_128(cvtepu8_epi64, 2)
  4351. Vc_AVX_TO_SSE_1_128(cvtepu16_epi32, 8)
  4352. Vc_AVX_TO_SSE_1_128(cvtepu16_epi64, 4)
  4353. Vc_AVX_TO_SSE_1_128(cvtepu32_epi64, 8)
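// Without AVX2 the 256-bit bitwise integer operations are routed through the
// float domain (cast to __m256, use _mm256_and_ps etc., cast back), and
// movemask_epi8 / blend_epi16 / blendv_epi8 are emulated per 128-bit half.
// With AVX2 all of these map to single native intrinsics.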
  4354. #ifndef Vc_IMPL_AVX2
  4355. static Vc_INTRINSIC m256i Vc_CONST and_si256(__m256i x, __m256i y) {
  4356. return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
  4357. }
  4358. static Vc_INTRINSIC m256i Vc_CONST andnot_si256(__m256i x, __m256i y) {
  4359. return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
  4360. }
  4361. static Vc_INTRINSIC m256i Vc_CONST or_si256(__m256i x, __m256i y) {
  4362. return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
  4363. }
  4364. static Vc_INTRINSIC m256i Vc_CONST xor_si256(__m256i x, __m256i y) {
  4365. return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
  4366. }
  4367. Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
  4368. {
  4369. m128i a1 = extract128<1>(a0);
  4370. return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0));
  4371. }
  4372. template <int m> Vc_INTRINSIC Vc_CONST m256i blend_epi16(__m256i a0, __m256i b0)
  4373. {
  4374. m128i a1 = extract128<1>(a0);
  4375. m128i b1 = extract128<1>(b0);
  4376. m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff);
  4377. m128i r1 = _mm_blend_epi16(a1, b1, m >> 8);
  4378. return insert128<1>(_mm256_castsi128_si256(r0), r1);
  4379. }
  4380. Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0) {
  4381. m128i a1 = extract128<1>(a0);
  4382. m128i b1 = extract128<1>(b0);
  4383. m128i m1 = extract128<1>(m0);
  4384. m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0));
  4385. m128i r1 = _mm_blendv_epi8(a1, b1, m1);
  4386. return insert128<1>(_mm256_castsi128_si256(r0), r1);
  4387. }
  4388. #else
  4389. static Vc_INTRINSIC Vc_CONST m256i xor_si256(__m256i x, __m256i y) { return _mm256_xor_si256(x, y); }
  4390. static Vc_INTRINSIC Vc_CONST m256i or_si256(__m256i x, __m256i y) { return _mm256_or_si256(x, y); }
  4391. static Vc_INTRINSIC Vc_CONST m256i and_si256(__m256i x, __m256i y) { return _mm256_and_si256(x, y); }
  4392. static Vc_INTRINSIC Vc_CONST m256i andnot_si256(__m256i x, __m256i y) { return _mm256_andnot_si256(x, y); }
  4393. Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0)
  4394. {
  4395. return _mm256_blendv_epi8(a0, b0, m0);
  4396. }
  4397. Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
  4398. {
  4399. return _mm256_movemask_epi8(a0);
  4400. }
  4401. #endif
  4402. static Vc_INTRINSIC m256i cmplt_epi64(__m256i a, __m256i b) {
  4403. return cmpgt_epi64(b, a);
  4404. }
  4405. static Vc_INTRINSIC m256i cmplt_epi32(__m256i a, __m256i b) {
  4406. return cmpgt_epi32(b, a);
  4407. }
  4408. static Vc_INTRINSIC m256i cmplt_epi16(__m256i a, __m256i b) {
  4409. return cmpgt_epi16(b, a);
  4410. }
  4411. static Vc_INTRINSIC m256i cmplt_epi8(__m256i a, __m256i b) {
  4412. return cmpgt_epi8(b, a);
  4413. }
  4414. static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b) {
  4415. return cmpgt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8()));
  4416. }
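// Unsigned 256-bit compares: with XOP the per-half _mm_com{lt,gt}_epu*
// instructions are used (via the macros above); otherwise the operands are
// sign-biased with the matching setmin constant so that the signed compare
// yields the unsigned ordering, just as in the 128-bit helpers.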
  4417. #if defined(Vc_IMPL_XOP)
  4418. Vc_AVX_TO_SSE_2_NEW(comlt_epu32)
  4419. Vc_AVX_TO_SSE_2_NEW(comgt_epu32)
  4420. Vc_AVX_TO_SSE_2_NEW(comlt_epu16)
  4421. Vc_AVX_TO_SSE_2_NEW(comgt_epu16)
  4422. static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i a, __m256i b) { return comlt_epu32(a, b); }
  4423. static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i a, __m256i b) { return comgt_epu32(a, b); }
  4424. static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i a, __m256i b) { return comlt_epu16(a, b); }
  4425. static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i a, __m256i b) { return comgt_epu16(a, b); }
  4426. #else
  4427. static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i _a, __m256i _b) {
  4428. m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
  4429. m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
  4430. return cmplt_epi32(a, b);
  4431. }
  4432. static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i _a, __m256i _b) {
  4433. m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
  4434. m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
  4435. return cmpgt_epi32(a, b);
  4436. }
  4437. static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i _a, __m256i _b) {
  4438. m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
  4439. m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
  4440. return cmplt_epi16(a, b);
  4441. }
  4442. static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i _a, __m256i _b) {
  4443. m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
  4444. m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
  4445. return cmpgt_epi16(a, b);
  4446. }
  4447. #endif
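// Overloaded masked stores. float/double/int use the native
// _mm256_maskstore_* forms (the int variant falls back to the float-domain
// maskstore when AVX2 is unavailable); 16-bit elements have no maskstore
// instruction, so they are written with two byte-masked _mm_maskmoveu_si128
// stores, one per 128-bit half.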
  4448. static Vc_INTRINSIC void _mm256_maskstore(float *mem, const __m256 mask, const __m256 v) {
  4449. _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v);
  4450. }
  4451. static Vc_INTRINSIC void _mm256_maskstore(double *mem, const __m256d mask, const __m256d v) {
  4452. _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v);
  4453. }
  4454. static Vc_INTRINSIC void _mm256_maskstore(int *mem, const __m256i mask, const __m256i v) {
  4455. #ifdef Vc_IMPL_AVX2
  4456. _mm256_maskstore_epi32(mem, mask, v);
  4457. #else
  4458. _mm256_maskstore_ps(reinterpret_cast<float *>(mem), mask, _mm256_castsi256_ps(v));
  4459. #endif
  4460. }
  4461. static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask, const __m256i v) {
  4462. _mm256_maskstore(reinterpret_cast<int *>(mem), mask, v);
  4463. }
  4464. static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v) {
  4465. using namespace AVX;
  4466. _mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask), reinterpret_cast<char *>(&mem[0]));
  4467. _mm_maskmoveu_si128(extract128<1>(v), extract128<1>(mask), reinterpret_cast<char *>(&mem[8]));
  4468. }
  4469. static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v) {
  4470. _mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
  4471. }
  4472. #undef Vc_AVX_TO_SSE_1
  4473. #undef Vc_AVX_TO_SSE_1_128
  4474. #undef Vc_AVX_TO_SSE_2_NEW
  4475. #undef Vc_AVX_TO_SSE_256_128
  4476. #undef Vc_AVX_TO_SSE_1i
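// Streaming (non-temporal) memory access. stream_load wraps the SSE4.1
// _mm_stream_load_si128 and builds the 256-bit variants from two 128-bit
// loads; stream_store uses the byte-masked _mm_maskmoveu_si128, again split
// per 128-bit half for the 256-bit overloads.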
  4477. template<typename R> Vc_INTRINSIC_L R stream_load(const float *mem) Vc_INTRINSIC_R;
  4478. template<> Vc_INTRINSIC m128 stream_load<m128>(const float *mem)
  4479. {
  4480. return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
  4481. }
  4482. template<> Vc_INTRINSIC m256 stream_load<m256>(const float *mem)
  4483. {
  4484. return insert128<1>(_mm256_castps128_ps256(stream_load<m128>(mem)),
  4485. stream_load<m128>(mem + 4));
  4486. }
  4487. template<typename R> Vc_INTRINSIC_L R stream_load(const double *mem) Vc_INTRINSIC_R;
  4488. template<> Vc_INTRINSIC m128d stream_load<m128d>(const double *mem)
  4489. {
  4490. return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
  4491. }
  4492. template<> Vc_INTRINSIC m256d stream_load<m256d>(const double *mem)
  4493. {
  4494. return insert128<1>(_mm256_castpd128_pd256(stream_load<m128d>(mem)),
  4495. stream_load<m128d>(mem + 2));
  4496. }
  4497. template<typename R> Vc_INTRINSIC_L R stream_load(const void *mem) Vc_INTRINSIC_R;
  4498. template<> Vc_INTRINSIC m128i stream_load<m128i>(const void *mem)
  4499. {
  4500. return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<void *>(mem)));
  4501. }
  4502. template<> Vc_INTRINSIC m256i stream_load<m256i>(const void *mem)
  4503. {
  4504. return insert128<1>(_mm256_castsi128_si256(stream_load<m128i>(mem)),
  4505. stream_load<m128i>(static_cast<const __m128i *>(mem) + 1));
  4506. }
  4507. Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask)
  4508. {
  4509. _mm_maskmoveu_si128(_mm_castps_si128(value), _mm_castps_si128(mask), reinterpret_cast<char *>(mem));
  4510. }
  4511. Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask)
  4512. {
  4513. stream_store(mem, _mm256_castps256_ps128(value), _mm256_castps256_ps128(mask));
  4514. stream_store(mem + 4, extract128<1>(value), extract128<1>(mask));
  4515. }
  4516. Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask)
  4517. {
  4518. _mm_maskmoveu_si128(_mm_castpd_si128(value), _mm_castpd_si128(mask), reinterpret_cast<char *>(mem));
  4519. }
  4520. Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask)
  4521. {
  4522. stream_store(mem, _mm256_castpd256_pd128(value), _mm256_castpd256_pd128(mask));
  4523. stream_store(mem + 2, extract128<1>(value), extract128<1>(mask));
  4524. }
  4525. Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask)
  4526. {
  4527. _mm_maskmoveu_si128(value, mask, reinterpret_cast<char *>(mem));
  4528. }
  4529. Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask)
  4530. {
  4531. stream_store(mem, _mm256_castsi256_si128(value), _mm256_castsi256_si128(mask));
  4532. stream_store(static_cast<__m128i *>(mem) + 1, extract128<1>(value), extract128<1>(mask));
  4533. }
  4534. #ifndef __x86_64__
  4535. Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
  4536. return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
  4537. }
  4538. #endif
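// AVX2 gathers: thin wrappers around _mm256_i32gather_* and the masked
// _mm256_mask_i32gather_* forms with the scale factor as a template
// parameter; the unsigned overloads only reinterpret the base pointer as
// const int*.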
  4539. #ifdef Vc_IMPL_AVX2
  4540. template <int Scale> __m256 gather(const float *addr, __m256i idx)
  4541. {
  4542. return _mm256_i32gather_ps(addr, idx, Scale);
  4543. }
  4544. template <int Scale> __m256d gather(const double *addr, __m128i idx)
  4545. {
  4546. return _mm256_i32gather_pd(addr, idx, Scale);
  4547. }
  4548. template <int Scale> __m256i gather(const int *addr, __m256i idx)
  4549. {
  4550. return _mm256_i32gather_epi32(addr, idx, Scale);
  4551. }
  4552. template <int Scale> __m256i gather(const unsigned *addr, __m256i idx)
  4553. {
  4554. return _mm256_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
  4555. }
  4556. template <int Scale> __m256 gather(__m256 src, __m256 k, const float *addr, __m256i idx)
  4557. {
  4558. return _mm256_mask_i32gather_ps(src, addr, idx, k, Scale);
  4559. }
  4560. template <int Scale>
  4561. __m256d gather(__m256d src, __m256d k, const double *addr, __m128i idx)
  4562. {
  4563. return _mm256_mask_i32gather_pd(src, addr, idx, k, Scale);
  4564. }
  4565. template <int Scale> __m256i gather(__m256i src, __m256i k, const int *addr, __m256i idx)
  4566. {
  4567. return _mm256_mask_i32gather_epi32(src, addr, idx, k, Scale);
  4568. }
  4569. template <int Scale>
  4570. __m256i gather(__m256i src, __m256i k, const unsigned *addr, __m256i idx)
  4571. {
  4572. return _mm256_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
  4573. }
  4574. #endif
  4575. }
  4576. }
  4577. namespace Vc_VERSIONED_NAMESPACE
  4578. {
  4579. namespace AVX
  4580. {
  4581. using namespace AvxIntrinsics;
  4582. }
  4583. namespace AVX2
  4584. {
  4585. using namespace AvxIntrinsics;
  4586. }
  4587. namespace AVX
  4588. {
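// VectorTypeHelper selects the AVX register type for a given scalar type:
// __m256i for every integer type, __m256 for float and __m256d for double.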
  4589. template<typename T> struct VectorTypeHelper;
  4590. template<> struct VectorTypeHelper< char > { typedef __m256i Type; };
  4591. template<> struct VectorTypeHelper< signed char > { typedef __m256i Type; };
  4592. template<> struct VectorTypeHelper<unsigned char > { typedef __m256i Type; };
  4593. template<> struct VectorTypeHelper< short> { typedef __m256i Type; };
  4594. template<> struct VectorTypeHelper<unsigned short> { typedef __m256i Type; };
  4595. template<> struct VectorTypeHelper< int > { typedef __m256i Type; };
  4596. template<> struct VectorTypeHelper<unsigned int > { typedef __m256i Type; };
  4597. template<> struct VectorTypeHelper< long > { typedef __m256i Type; };
  4598. template<> struct VectorTypeHelper<unsigned long > { typedef __m256i Type; };
  4599. template<> struct VectorTypeHelper< long long> { typedef __m256i Type; };
  4600. template<> struct VectorTypeHelper<unsigned long long> { typedef __m256i Type; };
  4601. template<> struct VectorTypeHelper< float> { typedef __m256 Type; };
  4602. template<> struct VectorTypeHelper< double> { typedef __m256d Type; };
  4603. template <typename T>
  4604. using IntegerVectorType =
  4605. typename std::conditional<sizeof(T) == 16, __m128i, __m256i>::type;
  4606. template <typename T>
  4607. using DoubleVectorType =
  4608. typename std::conditional<sizeof(T) == 16, __m128d, __m256d>::type;
  4609. template <typename T>
  4610. using FloatVectorType =
  4611. typename std::conditional<sizeof(T) == 16, __m128, __m256>::type;
  4612. template<typename T> struct VectorHelper {};
  4613. template<typename T> struct VectorHelperSize;
  4614. }
  4615. }
  4616. #endif
  4617. #endif
  4618. namespace Vc_VERSIONED_NAMESPACE
  4619. {
  4620. namespace Detail
  4621. {
  4622. template <typename V> inline V zero();
  4623. }
  4624. namespace Common
  4625. {
  4626. namespace Detail
  4627. {
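// IntrinsicType maps a (ValueType, Size) pair to the register type holding
// Size elements: with AVX, __m128/__m128d/__m128i when sizeof(ValueType) *
// Size == 16 and the 256-bit counterparts otherwise; with SSE only, always a
// 128-bit type; in the scalar fallback it is plain ValueType and Size must
// be 1.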
  4628. #ifdef Vc_IMPL_AVX
  4629. template <typename ValueType, size_t Size> struct IntrinsicType {
  4630. using type = typename std::conditional<
  4631. std::is_integral<ValueType>::value,
  4632. typename std::conditional<sizeof(ValueType) * Size == 16, __m128i, __m256i>::type,
  4633. typename std::conditional<
  4634. std::is_same<ValueType, double>::value,
  4635. typename std::conditional<sizeof(ValueType) * Size == 16, __m128d,
  4636. __m256d>::type,
  4637. typename std::conditional<sizeof(ValueType) * Size == 16, __m128,
  4638. __m256>::type>::type>::type;
  4639. };
  4640. #elif defined Vc_IMPL_SSE
  4641. template <typename ValueType, size_t Size> struct IntrinsicType {
  4642. using type = typename std::conditional<
  4643. std::is_integral<ValueType>::value, __m128i,
  4644. typename std::conditional<std::is_same<ValueType, double>::value, __m128d,
  4645. __m128>::type>::type;
  4646. };
  4647. #else
  4648. template <typename ValueType, size_t Size> struct IntrinsicType {
  4649. static_assert(Size == 1,
  4650. "IntrinsicType without SIMD target support may only have Size = 1");
  4651. using type = ValueType;
  4652. };
  4653. #endif
  4654. template <typename ValueType, size_t Size, size_t Bytes = sizeof(ValueType) * Size>
  4655. struct BuiltinType;
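// BuiltinType provides the GCC/Clang builtin vector (__vector_size__) that
// matches a given element type and total byte size (16 or 32 bytes); bool is
// represented as a vector of unsigned char.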
  4656. #ifdef Vc_USE_BUILTIN_VECTOR_TYPES
  4657. #define Vc_VECBUILTIN __attribute__((__vector_size__(16)))
  4658. template <size_t Size> struct BuiltinType< double , Size, 16> { typedef double type Vc_VECBUILTIN; };
  4659. template <size_t Size> struct BuiltinType< float , Size, 16> { typedef float type Vc_VECBUILTIN; };
  4660. template <size_t Size> struct BuiltinType< long long, Size, 16> { typedef long long type Vc_VECBUILTIN; };
  4661. template <size_t Size> struct BuiltinType<unsigned long long, Size, 16> { typedef unsigned long long type Vc_VECBUILTIN; };
  4662. template <size_t Size> struct BuiltinType< long , Size, 16> { typedef long type Vc_VECBUILTIN; };
  4663. template <size_t Size> struct BuiltinType<unsigned long , Size, 16> { typedef unsigned long type Vc_VECBUILTIN; };
  4664. template <size_t Size> struct BuiltinType< int , Size, 16> { typedef int type Vc_VECBUILTIN; };
  4665. template <size_t Size> struct BuiltinType<unsigned int , Size, 16> { typedef unsigned int type Vc_VECBUILTIN; };
  4666. template <size_t Size> struct BuiltinType< short , Size, 16> { typedef short type Vc_VECBUILTIN; };
  4667. template <size_t Size> struct BuiltinType<unsigned short , Size, 16> { typedef unsigned short type Vc_VECBUILTIN; };
  4668. template <size_t Size> struct BuiltinType< char , Size, 16> { typedef char type Vc_VECBUILTIN; };
  4669. template <size_t Size> struct BuiltinType<unsigned char , Size, 16> { typedef unsigned char type Vc_VECBUILTIN; };
  4670. template <size_t Size> struct BuiltinType< signed char , Size, 16> { typedef signed char type Vc_VECBUILTIN; };
  4671. template <size_t Size> struct BuiltinType< bool , Size, 16> { typedef unsigned char type Vc_VECBUILTIN; };
  4672. #undef Vc_VECBUILTIN
  4673. #define Vc_VECBUILTIN __attribute__((__vector_size__(32)))
  4674. template <size_t Size> struct BuiltinType< double , Size, 32> { typedef double type Vc_VECBUILTIN; };
  4675. template <size_t Size> struct BuiltinType< float , Size, 32> { typedef float type Vc_VECBUILTIN; };
  4676. template <size_t Size> struct BuiltinType< long long, Size, 32> { typedef long long type Vc_VECBUILTIN; };
  4677. template <size_t Size> struct BuiltinType<unsigned long long, Size, 32> { typedef unsigned long long type Vc_VECBUILTIN; };
  4678. template <size_t Size> struct BuiltinType< long , Size, 32> { typedef long type Vc_VECBUILTIN; };
  4679. template <size_t Size> struct BuiltinType<unsigned long , Size, 32> { typedef unsigned long type Vc_VECBUILTIN; };
  4680. template <size_t Size> struct BuiltinType< int , Size, 32> { typedef int type Vc_VECBUILTIN; };
  4681. template <size_t Size> struct BuiltinType<unsigned int , Size, 32> { typedef unsigned int type Vc_VECBUILTIN; };
  4682. template <size_t Size> struct BuiltinType< short , Size, 32> { typedef short type Vc_VECBUILTIN; };
  4683. template <size_t Size> struct BuiltinType<unsigned short , Size, 32> { typedef unsigned short type Vc_VECBUILTIN; };
  4684. template <size_t Size> struct BuiltinType< char , Size, 32> { typedef char type Vc_VECBUILTIN; };
  4685. template <size_t Size> struct BuiltinType<unsigned char , Size, 32> { typedef unsigned char type Vc_VECBUILTIN; };
  4686. template <size_t Size> struct BuiltinType< signed char , Size, 32> { typedef signed char type Vc_VECBUILTIN; };
  4687. template <size_t Size> struct BuiltinType< bool , Size, 32> { typedef unsigned char type Vc_VECBUILTIN; };
  4688. #undef Vc_VECBUILTIN
  4689. #endif
  4690. }
  4691. template <typename ValueType, size_t Size>
  4692. using IntrinsicType = typename Detail::IntrinsicType<ValueType, Size>::type;
  4693. template <typename ValueType, size_t Size>
  4694. using BuiltinType = typename Detail::BuiltinType<ValueType, Size>::type;
  4695. namespace AliasStrategy
  4696. {
  4697. struct Union {};
  4698. struct MayAlias {};
  4699. struct VectorBuiltin {};
  4700. struct UnionMembers {};
  4701. }
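// Storage can expose individual elements through different aliasing
// strategies: Union (type-pun through a union of vector and element array),
// MayAlias (element access through aliasing_cast), VectorBuiltin (GCC/Clang
// vector extensions with direct operator[]), and UnionMembers (MSVC's named
// union members such as m128i_i32). The default is picked per compiler below.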
  4702. using DefaultStrategy =
  4703. #if defined Vc_USE_BUILTIN_VECTOR_TYPES
  4704. AliasStrategy::VectorBuiltin;
  4705. #elif defined Vc_MSVC
  4706. AliasStrategy::UnionMembers;
  4707. #elif defined Vc_ICC
  4708. AliasStrategy::Union;
  4709. #elif defined __GNUC__
  4710. AliasStrategy::MayAlias;
  4711. #else
  4712. AliasStrategy::Union;
  4713. #endif
  4714. template <typename ValueType, size_t Size, typename Strategy = DefaultStrategy>
  4715. class Storage;
  4716. template <typename ValueType, size_t Size>
  4717. class Storage<ValueType, Size, AliasStrategy::Union>
  4718. {
  4719. static_assert(std::is_fundamental<ValueType>::value &&
  4720. std::is_arithmetic<ValueType>::value,
  4721. "Only works for fundamental arithmetic types.");
  4722. public:
  4723. using VectorType = IntrinsicType<ValueType, Size>;
  4724. using EntryType = ValueType;
  4725. union Alias {
  4726. Vc_INTRINSIC Alias(VectorType vv) : v(vv) {}
  4727. VectorType v;
  4728. EntryType m[Size];
  4729. };
  4730. Vc_INTRINSIC Storage() : data(Vc::Detail::zero<VectorType>()) {}
  4731. Vc_INTRINSIC Storage(const VectorType &x) : data(x) { assertCorrectAlignment(&data); }
  4732. template <typename U>
  4733. Vc_INTRINSIC explicit Storage(const U &x,
  4734. enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
  4735. : data(reinterpret_cast<VectorType>(x))
  4736. {
  4737. assertCorrectAlignment(&data);
  4738. }
  4739. Vc_INTRINSIC Storage(const Storage &) = default;
  4740. Vc_INTRINSIC Storage &operator=(const Storage &) = default;
  4741. Vc_INTRINSIC operator const VectorType &() const { return data; }
  4742. Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
  4743. Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
  4744. Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return Alias(data).m[i]; }
  4745. Vc_INTRINSIC void set(size_t i, EntryType x)
  4746. {
  4747. Alias a(data);
  4748. a.m[i] = x;
  4749. data = a.v;
  4750. }
  4751. private:
  4752. VectorType data;
  4753. };
  4754. template <typename ValueType, size_t Size>
  4755. class Storage<ValueType, Size, AliasStrategy::MayAlias>
  4756. {
  4757. static_assert(std::is_fundamental<ValueType>::value &&
  4758. std::is_arithmetic<ValueType>::value,
  4759. "Only works for fundamental arithmetic types.");
  4760. public:
  4761. using VectorType = IntrinsicType<ValueType, Size>;
  4762. using EntryType = ValueType;
  4763. Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
  4764. Vc_INTRINSIC Storage(const VectorType &x) : data(x)
  4765. {
  4766. assertCorrectAlignment(&data);
  4767. }
  4768. template <typename U>
  4769. Vc_INTRINSIC explicit Storage(const U &x,
  4770. enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
  4771. : data(reinterpret_cast<const VectorType &>(x))
  4772. {
  4773. assertCorrectAlignment(&data);
  4774. }
  4775. Vc_INTRINSIC Storage &operator=(const VectorType &x)
  4776. {
  4777. data = x;
  4778. return *this;
  4779. }
  4780. Vc_INTRINSIC Storage(const Storage &) = default;
  4781. Vc_INTRINSIC Storage &operator=(const Storage &) = default;
  4782. Vc_INTRINSIC operator const VectorType &() const { return v(); }
  4783. Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
  4784. Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
  4785. Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const
  4786. {
  4787. return aliasing_cast<EntryType>(&data)[i];
  4788. }
  4789. Vc_INTRINSIC void set(size_t i, EntryType x)
  4790. {
  4791. aliasing_cast<EntryType>(&data)[i] = x;
  4792. }
  4793. private:
  4794. VectorType data;
  4795. };
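// VectorBuiltin strategy: the data member is the builtin vector type itself,
// so element access is plain data[i]; v() merely reinterprets the builtin
// vector as the intrinsic type and back.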
  4796. template <typename ValueType, size_t Size>
  4797. class Storage<ValueType, Size, AliasStrategy::VectorBuiltin>
  4798. {
  4799. static_assert(std::is_fundamental<ValueType>::value &&
  4800. std::is_arithmetic<ValueType>::value,
  4801. "Only works for fundamental arithmetic types.");
  4802. using Builtin = BuiltinType<ValueType, Size>;
  4803. public:
  4804. using VectorType =
  4805. #ifdef Vc_TEMPLATES_DROP_ATTRIBUTES
  4806. MayAlias<IntrinsicType<ValueType, Size>>;
  4807. #else
  4808. IntrinsicType<ValueType, Size>;
  4809. #endif
  4810. using EntryType = ValueType;
  4811. Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
  4812. Vc_INTRINSIC Storage(const Storage &) = default;
  4813. Vc_INTRINSIC Storage &operator=(const Storage &) = default;
  4814. Vc_INTRINSIC Storage(const VectorType &x)
  4815. : data(aliasing_cast<Builtin>(x))
  4816. {
  4817. assertCorrectAlignment(&data);
  4818. }
  4819. template <typename U>
  4820. Vc_INTRINSIC explicit Storage(const U &x,
  4821. enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
  4822. : data(aliasing_cast<Builtin>(x))
  4823. {
  4824. assertCorrectAlignment(&data);
  4825. }
  4826. Vc_INTRINSIC Storage &operator=(const VectorType &x)
  4827. {
  4828. data = aliasing_cast<Builtin>(x);
  4829. return *this;
  4830. }
  4831. Vc_INTRINSIC operator const VectorType &() const { return v(); }
  4832. Vc_INTRINSIC Vc_PURE VectorType &v() { return reinterpret_cast<VectorType &>(data); }
  4833. Vc_INTRINSIC Vc_PURE const VectorType &v() const { return reinterpret_cast<const VectorType &>(data); }
  4834. Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return data[i]; }
  4835. Vc_INTRINSIC void set(size_t i, EntryType x) { data[i] = x; }
  4836. Vc_INTRINSIC Builtin &builtin() { return data; }
  4837. Vc_INTRINSIC const Builtin &builtin() const { return data; }
  4838. private:
  4839. Builtin data;
  4840. };
  4841. template <typename ValueType, size_t Size>
  4842. class Storage<ValueType, Size, AliasStrategy::UnionMembers>
  4843. {
  4844. static_assert(std::is_fundamental<ValueType>::value &&
  4845. std::is_arithmetic<ValueType>::value,
  4846. "Only works for fundamental arithmetic types.");
  4847. public:
  4848. using VectorType = IntrinsicType<ValueType, Size>;
  4849. using EntryType = ValueType;
  4850. Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
  4851. Vc_INTRINSIC Storage(const VectorType &x) : data(x)
  4852. {
  4853. assertCorrectAlignment(&data);
  4854. }
  4855. template <typename U>
  4856. Vc_INTRINSIC explicit Storage(const U &x,
  4857. enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
  4858. : data(reinterpret_cast<const VectorType &>(x))
  4859. {
  4860. assertCorrectAlignment(&data);
  4861. }
  4862. Vc_INTRINSIC Storage &operator=(const VectorType &x)
  4863. {
  4864. data = x;
  4865. return *this;
  4866. }
  4867. Vc_INTRINSIC Storage(const Storage &) = default;
  4868. Vc_INTRINSIC Storage &operator=(const Storage &) = default;
  4869. Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
  4870. Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
  4871. Vc_INTRINSIC_L Vc_PURE_L EntryType m(size_t i) const Vc_INTRINSIC_R Vc_PURE_R;
  4872. Vc_INTRINSIC void set(size_t i, EntryType x) { ref(i) = x; }
  4873. private:
  4874. Vc_INTRINSIC_L Vc_PURE_L EntryType &ref(size_t i) Vc_INTRINSIC_R Vc_PURE_R;
  4875. VectorType data;
  4876. };
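// The UnionMembers accessors are only defined for MSVC, which exposes the
// vector registers as unions with named element arrays (m128i_i32,
// m256d_f64, ...); m() and ref() simply index into those members.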
  4877. #ifdef Vc_MSVC
  4878. template <> Vc_INTRINSIC Vc_PURE double Storage< double, 2, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128d_f64[i]; }
  4879. template <> Vc_INTRINSIC Vc_PURE float Storage< float , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128_f32[i]; }
  4880. template <> Vc_INTRINSIC Vc_PURE signed int Storage< signed int , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i32[i]; }
  4881. template <> Vc_INTRINSIC Vc_PURE signed short Storage< signed short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i16[i]; }
  4882. template <> Vc_INTRINSIC Vc_PURE signed char Storage< signed char ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i8[i]; }
  4883. template <> Vc_INTRINSIC Vc_PURE unsigned int Storage<unsigned int , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u32[i]; }
  4884. template <> Vc_INTRINSIC Vc_PURE unsigned short Storage<unsigned short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u16[i]; }
  4885. template <> Vc_INTRINSIC Vc_PURE unsigned char Storage<unsigned char ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u8[i]; }
  4886. template <> Vc_INTRINSIC Vc_PURE double &Storage< double, 2, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128d_f64[i]; }
  4887. template <> Vc_INTRINSIC Vc_PURE float &Storage< float , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128_f32[i]; }
  4888. template <> Vc_INTRINSIC Vc_PURE signed int &Storage< signed int , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i32[i]; }
  4889. template <> Vc_INTRINSIC Vc_PURE signed short &Storage< signed short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i16[i]; }
  4890. template <> Vc_INTRINSIC Vc_PURE signed char &Storage< signed char ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast<signed char &>(data.m128i_i8[i]); }
  4891. template <> Vc_INTRINSIC Vc_PURE unsigned int &Storage<unsigned int , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u32[i]; }
  4892. template <> Vc_INTRINSIC Vc_PURE unsigned short &Storage<unsigned short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u16[i]; }
  4893. template <> Vc_INTRINSIC Vc_PURE unsigned char &Storage<unsigned char ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u8[i]; }
  4894. #ifdef Vc_IMPL_AVX
  4895. template <> Vc_INTRINSIC Vc_PURE double Storage< double, 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256d_f64[i]; }
  4896. template <> Vc_INTRINSIC Vc_PURE float Storage< float , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256_f32[i]; }
  4897. template <> Vc_INTRINSIC Vc_PURE signed int Storage< signed int , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i32[i]; }
  4898. template <> Vc_INTRINSIC Vc_PURE signed short Storage< signed short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i16[i]; }
  4899. template <> Vc_INTRINSIC Vc_PURE signed char Storage< signed char ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i8[i]; }
  4900. template <> Vc_INTRINSIC Vc_PURE unsigned int Storage<unsigned int , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u32[i]; }
  4901. template <> Vc_INTRINSIC Vc_PURE unsigned short Storage<unsigned short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u16[i]; }
  4902. template <> Vc_INTRINSIC Vc_PURE unsigned char Storage<unsigned char ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u8[i]; }
  4903. template <> Vc_INTRINSIC Vc_PURE double &Storage< double, 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256d_f64[i]; }
  4904. template <> Vc_INTRINSIC Vc_PURE float &Storage< float , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256_f32[i]; }
  4905. template <> Vc_INTRINSIC Vc_PURE signed int &Storage< signed int , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i32[i]; }
  4906. template <> Vc_INTRINSIC Vc_PURE signed short &Storage< signed short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i16[i]; }
  4907. template <> Vc_INTRINSIC Vc_PURE signed char &Storage< signed char ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast<signed char &>(data.m256i_i8[i]); }
  4908. template <> Vc_INTRINSIC Vc_PURE unsigned int &Storage<unsigned int , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u32[i]; }
  4909. template <> Vc_INTRINSIC Vc_PURE unsigned short &Storage<unsigned short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u16[i]; }
  4910. template <> Vc_INTRINSIC Vc_PURE unsigned char &Storage<unsigned char ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u8[i]; }
  4911. #endif
  4912. #endif
  4913. template <typename VectorType, typename EntryType>
  4914. using VectorMemoryUnion = Storage<EntryType, sizeof(VectorType) / sizeof(EntryType)>;
  4915. }
  4916. }
  4917. #endif
  4918. #ifndef VC_SSE_CONST_DATA_H_
  4919. #define VC_SSE_CONST_DATA_H_
  4920. #ifndef VC_SSE_MACROS_H_
  4921. #define VC_SSE_MACROS_H_
  4922. #if defined(Vc_IMPL_SSE4_1) && !defined(Vc_DISABLE_PTEST)
  4923. #define Vc_USE_PTEST
  4924. #endif
  4925. #endif
  4926. namespace Vc_VERSIONED_NAMESPACE
  4927. {
  4928. namespace SSE
  4929. {
  4930. alignas(16) extern const unsigned int _IndexesFromZero4[4];
  4931. alignas(16) extern const unsigned short _IndexesFromZero8[8];
  4932. alignas(16) extern const unsigned char _IndexesFromZero16[16];
  4933. struct c_general
  4934. {
  4935. alignas(64) static const int absMaskFloat[4];
  4936. alignas(16) static const unsigned int signMaskFloat[4];
  4937. alignas(16) static const unsigned int highMaskFloat[4];
  4938. alignas(16) static const short minShort[8];
  4939. alignas(16) static const unsigned short one16[8];
  4940. alignas(16) static const unsigned int one32[4];
  4941. alignas(16) static const float oneFloat[4];
  4942. alignas(16) static const unsigned long long highMaskDouble[2];
  4943. alignas(16) static const double oneDouble[2];
  4944. alignas(16) static const long long absMaskDouble[2];
  4945. alignas(16) static const unsigned long long signMaskDouble[2];
  4946. alignas(16) static const unsigned long long frexpMask[2];
  4947. };
  4948. template<typename T> struct c_trig
  4949. {
  4950. alignas(64) static const T data[];
  4951. };
  4952. #ifndef Vc_MSVC
  4953. template <> alignas(64) const float c_trig<float>::data[];
  4954. template <> alignas(64) const double c_trig<double>::data[];
  4955. #endif
  4956. template<typename T> struct c_log
  4957. {
  4958. enum VectorSize { Size = 16 / sizeof(T) };
  4959. static Vc_ALWAYS_INLINE Vc_CONST const float *d(int i) { return reinterpret_cast<const float *>(&data[i * Size]); }
  4960. alignas(64) static const unsigned int data[21 * Size];
  4961. };
  4962. #ifndef Vc_MSVC
  4963. template<> alignas(64) const unsigned int c_log<float>::data[21 * 4];
  4964. #endif
  4965. template<> struct c_log<double>
  4966. {
  4967. enum VectorSize { Size = 16 / sizeof(double) };
  4968. static Vc_ALWAYS_INLINE Vc_CONST const double *d(int i) { return reinterpret_cast<const double *>(&data[i * Size]); }
  4969. alignas(64) static const unsigned long long data[21 * Size];
  4970. };
  4971. }
  4972. }
  4973. #endif
  4974. #include <cstdlib>
  4975. #if defined(Vc_GCC) && !defined(__OPTIMIZE__)
  4976. #pragma GCC diagnostic push
  4977. #pragma GCC diagnostic ignored "-Wold-style-cast"
  4978. #endif
  4979. namespace Vc_VERSIONED_NAMESPACE
  4980. {
  4981. namespace SseIntrinsics
  4982. {
  4983. using SSE::c_general;
  4984. constexpr std::size_t VectorAlignment = 16;
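// For GCC older than 4.6 (unless Vc_DONT_FIX_SSE_SHIFT is defined) the
// _mm_s{l,r}l_epi* shifts are re-issued through inline assembly; presumably
// this works around a code-generation problem with the builtin intrinsics in
// those compiler versions.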
  4985. #if defined(Vc_GCC) && Vc_GCC < 0x40600 && !defined(Vc_DONT_FIX_SSE_SHIFT)
  4986. static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
  4987. static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
  4988. static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
  4989. static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
  4990. static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
  4991. static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
  4992. #endif
  4993. #ifdef Vc_GCC
  4994. static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); }
  4995. static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); }
  4996. static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); }
  4997. static Vc_INTRINSIC Vc_CONST __m128 _mm_mul_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); }
  4998. static Vc_INTRINSIC Vc_CONST __m128 _mm_add_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); }
  4999. static Vc_INTRINSIC Vc_CONST __m128 _mm_sub_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); }
  5000. #endif
  5001. static Vc_INTRINSIC Vc_CONST __m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
  5002. static Vc_INTRINSIC Vc_CONST __m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
  5003. static Vc_INTRINSIC Vc_CONST __m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
  5004. static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
  5005. static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); }
  5006. static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); }
  5007. static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); }
  5008. static Vc_INTRINSIC __m128 Vc_CONST _mm_setone_ps() { return _mm_load_ps(c_general::oneFloat); }
  5009. static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); }
  5010. static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); }
  5011. static Vc_INTRINSIC __m128 Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); }
  5012. static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); }
  5013. static Vc_INTRINSIC __m128 Vc_CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); }
  5014. static Vc_INTRINSIC __m128i Vc_CONST setmin_epi8 () { return _mm_set1_epi8(-0x80); }
  5015. static Vc_INTRINSIC __m128i Vc_CONST setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); }
  5016. static Vc_INTRINSIC __m128i Vc_CONST setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); }
  5017. #if defined(Vc_IMPL_XOP)
  5018. static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b) { return _mm_comgt_epu8(a, b); }
  5019. static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b) { return _mm_comlt_epu16(a, b); }
  5020. static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b) { return _mm_comgt_epu16(a, b); }
  5021. static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b) { return _mm_comlt_epu32(a, b); }
  5022. static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b) { return _mm_comgt_epu32(a, b); }
  5023. static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu64(__m128i a, __m128i b) { return _mm_comlt_epu64(a, b); }
  5024. #else
  5025. static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b)
  5026. {
  5027. return _mm_cmpgt_epi8(_mm_xor_si128(a, setmin_epi8()),
  5028. _mm_xor_si128(b, setmin_epi8()));
  5029. }
  5030. static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b)
  5031. {
  5032. return _mm_cmplt_epi16(_mm_xor_si128(a, setmin_epi16()),
  5033. _mm_xor_si128(b, setmin_epi16()));
  5034. }
  5035. static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b)
  5036. {
  5037. return _mm_cmpgt_epi16(_mm_xor_si128(a, setmin_epi16()),
  5038. _mm_xor_si128(b, setmin_epi16()));
  5039. }
  5040. static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b)
  5041. {
  5042. return _mm_cmplt_epi32(_mm_xor_si128(a, setmin_epi32()),
  5043. _mm_xor_si128(b, setmin_epi32()));
  5044. }
  5045. static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b)
  5046. {
  5047. return _mm_cmpgt_epi32(_mm_xor_si128(a, setmin_epi32()),
  5048. _mm_xor_si128(b, setmin_epi32()));
  5049. }
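// 64-bit signed greater-than without SSE4.2: flip the sign bit of the low
// 32-bit half of every 64-bit lane, compare the 32-bit halves, and combine:
// the lane is set if the high halves compare greater, or if they compare
// equal and the low halves compare greater as unsigned values.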
  5050. Vc_INTRINSIC __m128i Vc_CONST cmpgt_epi64(__m128i a, __m128i b)
  5051. {
  5052. #ifdef Vc_IMPL_SSE4_2
  5053. return _mm_cmpgt_epi64(a, b);
  5054. #else
  5055. const auto aa = _mm_xor_si128(a, _mm_srli_epi64(setmin_epi32(),32));
  5056. const auto bb = _mm_xor_si128(b, _mm_srli_epi64(setmin_epi32(),32));
  5057. const auto gt = _mm_cmpgt_epi32(aa, bb);
  5058. const auto eq = _mm_cmpeq_epi32(aa, bb);
  5059. const auto gt2 =
  5060. _mm_shuffle_epi32(gt, 0xf5);
  5061. const auto lo =
  5062. _mm_shuffle_epi32(_mm_and_si128(_mm_srli_epi64(eq, 32), gt), 0xa0);
  5063. return _mm_or_si128(gt2, lo);
  5064. #endif
  5065. }
  5066. #endif
  5067. }
  5068. }
  5069. #ifdef Vc_IMPL_SSSE3
  5070. namespace Vc_VERSIONED_NAMESPACE
  5071. {
  5072. namespace SseIntrinsics
  5073. {
  5074. Vc_INTRINSIC Vc_CONST __m128i abs_epi8(__m128i a) { return _mm_abs_epi8(a); }
  5075. Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) { return _mm_abs_epi16(a); }
  5076. Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) { return _mm_abs_epi32(a); }
  5077. template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
  5078. {
  5079. return _mm_alignr_epi8(a, b, s & 0x1fu);
  5080. }
  5081. }
  5082. }
  5083. #else
  5084. namespace Vc_VERSIONED_NAMESPACE
  5085. {
  5086. namespace SseIntrinsics
  5087. {
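// Pre-SSSE3 fallbacks: abs_epi{8,16,32} negate the negative lanes by hand
// (x ^ mask, then add 1 in those lanes, i.e. conditional two's complement),
// and alignr_epi8 dispatches on the compile-time shift count using
// _mm_slli_si128 / _mm_srli_si128 pairs.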
  5088. Vc_INTRINSIC Vc_CONST __m128i abs_epi8 (__m128i a) {
  5089. __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128());
  5090. return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_set1_epi8(1)));
  5091. }
  5092. Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) {
  5093. __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
  5094. return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
  5095. }
  5096. Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) {
  5097. __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
  5098. return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
  5099. }
  5100. template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
  5101. {
  5102. switch (s & 0x1fu) {
  5103. case 0: return b;
  5104. case 1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b, 1));
  5105. case 2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b, 2));
  5106. case 3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b, 3));
  5107. case 4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b, 4));
  5108. case 5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b, 5));
  5109. case 6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b, 6));
  5110. case 7: return _mm_or_si128(_mm_slli_si128(a, 9), _mm_srli_si128(b, 7));
  5111. case 8: return _mm_or_si128(_mm_slli_si128(a, 8), _mm_srli_si128(b, 8));
  5112. case 9: return _mm_or_si128(_mm_slli_si128(a, 7), _mm_srli_si128(b, 9));
  5113. case 10: return _mm_or_si128(_mm_slli_si128(a, 6), _mm_srli_si128(b, 10));
  5114. case 11: return _mm_or_si128(_mm_slli_si128(a, 5), _mm_srli_si128(b, 11));
  5115. case 12: return _mm_or_si128(_mm_slli_si128(a, 4), _mm_srli_si128(b, 12));
  5116. case 13: return _mm_or_si128(_mm_slli_si128(a, 3), _mm_srli_si128(b, 13));
  5117. case 14: return _mm_or_si128(_mm_slli_si128(a, 2), _mm_srli_si128(b, 14));
  5118. case 15: return _mm_or_si128(_mm_slli_si128(a, 1), _mm_srli_si128(b, 15));
  5119. case 16: return a;
  5120. case 17: return _mm_srli_si128(a, 1);
  5121. case 18: return _mm_srli_si128(a, 2);
  5122. case 19: return _mm_srli_si128(a, 3);
  5123. case 20: return _mm_srli_si128(a, 4);
  5124. case 21: return _mm_srli_si128(a, 5);
  5125. case 22: return _mm_srli_si128(a, 6);
  5126. case 23: return _mm_srli_si128(a, 7);
  5127. case 24: return _mm_srli_si128(a, 8);
  5128. case 25: return _mm_srli_si128(a, 9);
  5129. case 26: return _mm_srli_si128(a, 10);
  5130. case 27: return _mm_srli_si128(a, 11);
  5131. case 28: return _mm_srli_si128(a, 12);
  5132. case 29: return _mm_srli_si128(a, 13);
  5133. case 30: return _mm_srli_si128(a, 14);
  5134. case 31: return _mm_srli_si128(a, 15);
  5135. }
  5136. return _mm_setzero_si128();
  5137. }
  5138. }
  5139. }
  5140. #endif
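// SSE4.1 helpers: when SSE4.1 is available the following functions are plain
// wrappers around the corresponding intrinsics; the #else branch below
// re-implements them on SSE2 (blendv via andnot/and/or selects, blend_* by
// materialising the mask at compile time, and e.g. max_epi8/max_epi32 via
// compare-and-select).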
  5141. #ifdef Vc_IMPL_SSE4_1
  5142. namespace Vc_VERSIONED_NAMESPACE
  5143. {
  5144. namespace SseIntrinsics
  5145. {
  5146. Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b)
  5147. {
  5148. return _mm_cmpeq_epi64(a, b);
  5149. }
  5150. template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
  5151. {
  5152. return _mm_extract_epi32(v, index);
  5153. }
  5154. Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c)
  5155. {
  5156. return _mm_blendv_pd(a, b, c);
  5157. }
  5158. Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c)
  5159. {
  5160. return _mm_blendv_ps(a, b, c);
  5161. }
  5162. Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c)
  5163. {
  5164. return _mm_blendv_epi8(a, b, c);
  5165. }
  5166. template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
  5167. {
  5168. return _mm_blend_pd(a, b, mask);
  5169. }
  5170. template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
  5171. {
  5172. return _mm_blend_ps(a, b, mask);
  5173. }
  5174. template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
  5175. {
  5176. return _mm_blend_epi16(a, b, mask);
  5177. }
  5178. Vc_INTRINSIC Vc_CONST __m128i max_epi8(__m128i a, __m128i b)
  5179. {
  5180. return _mm_max_epi8(a, b);
  5181. }
  5182. Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b)
  5183. {
  5184. return _mm_max_epi32(a, b);
  5185. }
  5186. Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b)
  5187. {
  5188. return _mm_max_epu16(a, b);
  5189. }
  5190. Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b)
  5191. {
  5192. return _mm_max_epu32(a, b);
  5193. }
  5194. Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b)
  5195. {
  5196. return _mm_min_epu16(a, b);
  5197. }
  5198. Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b)
  5199. {
  5200. return _mm_min_epu32(a, b);
  5201. }
  5202. Vc_INTRINSIC Vc_CONST __m128i min_epi8(__m128i a, __m128i b)
  5203. {
  5204. return _mm_min_epi8(a, b);
  5205. }
  5206. Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b)
  5207. {
  5208. return _mm_min_epi32(a, b);
  5209. }
  5210. Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8)
  5211. {
  5212. return _mm_cvtepu8_epi16(epu8);
  5213. }
  5214. Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8)
  5215. {
  5216. return _mm_cvtepi8_epi16(epi8);
  5217. }
  5218. Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16)
  5219. {
  5220. return _mm_cvtepu16_epi32(epu16);
  5221. }
  5222. Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epu16)
  5223. {
  5224. return _mm_cvtepi16_epi32(epu16);
  5225. }
  5226. Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8)
  5227. {
  5228. return _mm_cvtepu8_epi32(epu8);
  5229. }
  5230. Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8)
  5231. {
  5232. return _mm_cvtepi8_epi32(epi8);
  5233. }
  5234. }
  5235. }
  5236. #else
  5237. namespace Vc_VERSIONED_NAMESPACE
  5238. {
  5239. namespace SseIntrinsics
  5240. {
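// cmpeq_epi64 on SSE2: compare the 32-bit halves, then AND the result with a
// copy whose halves are swapped within each 64-bit lane, so a lane is
// all-ones only if both of its halves compared equal.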
  5241. Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b) {
  5242. auto tmp = _mm_cmpeq_epi32(a, b);
  5243. return _mm_and_si128(tmp, _mm_shuffle_epi32(tmp, 1*1 + 0*4 + 3*16 + 2*64));
  5244. }
  5245. template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
  5246. {
  5247. #ifdef Vc_USE_BUILTIN_VECTOR_TYPES
  5248. typedef int int32v4 __attribute__((__vector_size__(16)));
  5249. return aliasing_cast<int32v4>(v)[index];
  5250. #else
  5251. return _mm_cvtsi128_si32(_mm_srli_si128(v, index * 4));
  5252. #endif
  5253. }
  5254. Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c) {
  5255. #ifdef Vc_GCC
  5256. return reinterpret_cast<__m128d>(
  5257. (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
  5258. (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
  5259. #else
  5260. return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
  5261. #endif
  5262. }
  5263. Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c) {
  5264. #ifdef Vc_GCC
  5265. return reinterpret_cast<__m128>(
  5266. (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
  5267. (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
  5268. #else
  5269. return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
  5270. #endif
  5271. }
  5272. Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c) {
  5273. #ifdef Vc_GCC
  5274. return (~c & a) | (c & b);
  5275. #else
  5276. return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
  5277. #endif
  5278. }
  5279. template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
  5280. {
  5281. switch (mask) {
  5282. case 0x0:
  5283. return a;
  5284. case 0x1:
  5285. return _mm_shuffle_pd(b, a, 2);
  5286. case 0x2:
  5287. return _mm_shuffle_pd(a, b, 2);
  5288. case 0x3:
  5289. return b;
  5290. default:
  5291. abort();
  5292. return a;
  5293. }
  5294. }
  5295. template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
  5296. {
  5297. __m128i c;
  5298. switch (mask) {
  5299. case 0x0:
  5300. return a;
  5301. case 0x1:
  5302. c = _mm_srli_si128(_mm_setallone_si128(), 12);
  5303. break;
  5304. case 0x2:
  5305. c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4);
  5306. break;
  5307. case 0x3:
  5308. c = _mm_srli_si128(_mm_setallone_si128(), 8);
  5309. break;
  5310. case 0x4:
  5311. c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8);
  5312. break;
  5313. case 0x5:
  5314. c = _mm_set_epi32(0, -1, 0, -1);
  5315. break;
  5316. case 0x6:
  5317. c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4);
  5318. break;
  5319. case 0x7:
  5320. c = _mm_srli_si128(_mm_setallone_si128(), 4);
  5321. break;
  5322. case 0x8:
  5323. c = _mm_slli_si128(_mm_setallone_si128(), 12);
  5324. break;
  5325. case 0x9:
  5326. c = _mm_set_epi32(-1, 0, 0, -1);
  5327. break;
  5328. case 0xa:
  5329. c = _mm_set_epi32(-1, 0, -1, 0);
  5330. break;
  5331. case 0xb:
  5332. c = _mm_set_epi32(-1, 0, -1, -1);
  5333. break;
  5334. case 0xc:
  5335. c = _mm_slli_si128(_mm_setallone_si128(), 8);
  5336. break;
  5337. case 0xd:
  5338. c = _mm_set_epi32(-1, -1, 0, -1);
  5339. break;
  5340. case 0xe:
  5341. c = _mm_slli_si128(_mm_setallone_si128(), 4);
  5342. break;
  5343. case 0xf:
  5344. return b;
  5345. default:
  5346. abort();
  5347. c = _mm_setzero_si128();
  5348. break;
  5349. }
  5350. __m128 _c = _mm_castsi128_ps(c);
  5351. return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
  5352. }
  5353. template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
  5354. {
  5355. __m128i c;
  5356. switch (mask) {
  5357. case 0x00:
  5358. return a;
  5359. case 0x01:
  5360. c = _mm_srli_si128(_mm_setallone_si128(), 14);
  5361. break;
  5362. case 0x03:
  5363. c = _mm_srli_si128(_mm_setallone_si128(), 12);
  5364. break;
  5365. case 0x07:
  5366. c = _mm_srli_si128(_mm_setallone_si128(), 10);
  5367. break;
  5368. case 0x0f:
  5369. return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
  5370. case 0x1f:
  5371. c = _mm_srli_si128(_mm_setallone_si128(), 6);
  5372. break;
  5373. case 0x3f:
  5374. c = _mm_srli_si128(_mm_setallone_si128(), 4);
  5375. break;
  5376. case 0x7f:
  5377. c = _mm_srli_si128(_mm_setallone_si128(), 2);
  5378. break;
  5379. case 0x80:
  5380. c = _mm_slli_si128(_mm_setallone_si128(), 14);
  5381. break;
  5382. case 0xc0:
  5383. c = _mm_slli_si128(_mm_setallone_si128(), 12);
  5384. break;
  5385. case 0xe0:
  5386. c = _mm_slli_si128(_mm_setallone_si128(), 10);
  5387. break;
  5388. case 0xf0:
  5389. c = _mm_slli_si128(_mm_setallone_si128(), 8);
  5390. break;
  5391. case 0xf8:
  5392. c = _mm_slli_si128(_mm_setallone_si128(), 6);
  5393. break;
  5394. case 0xfc:
  5395. c = _mm_slli_si128(_mm_setallone_si128(), 4);
  5396. break;
  5397. case 0xfe:
  5398. c = _mm_slli_si128(_mm_setallone_si128(), 2);
  5399. break;
  5400. case 0xff:
  5401. return b;
  5402. case 0xcc:
  5403. return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
  5404. case 0x33:
  5405. return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
  5406. default:
  5407. const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
  5408. c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
  5409. break;
  5410. }
  5411. return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
  5412. }
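// SSE2 provides no min/max for these element types; emulate them as compare + blend.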
  5413. Vc_INTRINSIC Vc_CONST __m128i max_epi8 (__m128i a, __m128i b) {
  5414. return blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b));
  5415. }
  5416. Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b) {
  5417. return blendv_epi8(b, a, _mm_cmpgt_epi32(a, b));
  5418. }
  5419. Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b) {
  5420. return blendv_epi8(b, a, cmpgt_epu16(a, b));
  5421. }
  5422. Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b) {
  5423. return blendv_epi8(b, a, cmpgt_epu32(a, b));
  5424. }
  5425. Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b) {
  5426. return blendv_epi8(a, b, cmpgt_epu16(a, b));
  5427. }
  5428. Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b) {
  5429. return blendv_epi8(a, b, cmpgt_epu32(a, b));
  5430. }
  5431. Vc_INTRINSIC Vc_CONST __m128i min_epi8 (__m128i a, __m128i b) {
  5432. return blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b));
  5433. }
  5434. Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) {
  5435. return blendv_epi8(a, b, _mm_cmpgt_epi32(a, b));
  5436. }
  5437. Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8) {
  5438. return _mm_unpacklo_epi8(epu8, _mm_setzero_si128());
  5439. }
  5440. Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8) {
  5441. return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128()));
  5442. }
  5443. Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16) {
  5444. return _mm_unpacklo_epi16(epu16, _mm_setzero_si128());
  5445. }
5446. Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epi16) {
5447. return _mm_unpacklo_epi16(epi16, _mm_cmplt_epi16(epi16, _mm_setzero_si128()));
  5448. }
  5449. Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8) {
  5450. return cvtepu16_epi32(cvtepu8_epi16(epu8));
  5451. }
  5452. Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8) {
  5453. const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
  5454. const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg);
  5455. return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg));
  5456. }
  5457. }
  5458. }
  5459. #endif
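// Streaming loads: with SSE4.1 this maps to the non-temporal MOVNTDQA load
// (_mm_stream_load_si128); without it, a regular aligned load is used instead.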
  5460. namespace Vc_VERSIONED_NAMESPACE
  5461. {
  5462. namespace SseIntrinsics
  5463. {
  5464. static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem) {
  5465. #ifdef Vc_IMPL_SSE4_1
  5466. return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
  5467. #else
  5468. return _mm_load_ps(mem);
  5469. #endif
  5470. }
  5471. static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) {
  5472. #ifdef Vc_IMPL_SSE4_1
  5473. return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
  5474. #else
  5475. return _mm_load_pd(mem);
  5476. #endif
  5477. }
  5478. static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) {
  5479. #ifdef Vc_IMPL_SSE4_1
  5480. return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
  5481. #else
  5482. return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
  5483. #endif
  5484. }
  5485. static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) {
  5486. return _mm_stream_load(reinterpret_cast<const int *>(mem));
  5487. }
  5488. static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) {
  5489. return _mm_stream_load(reinterpret_cast<const int *>(mem));
  5490. }
  5491. static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) {
  5492. return _mm_stream_load(reinterpret_cast<const int *>(mem));
  5493. }
  5494. static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) {
  5495. return _mm_stream_load(reinterpret_cast<const int *>(mem));
  5496. }
  5497. static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) {
  5498. return _mm_stream_load(reinterpret_cast<const int *>(mem));
  5499. }
  5500. #ifndef __x86_64__
  5501. Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
  5502. return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
  5503. }
  5504. #endif
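// Thin wrappers around the AVX2 32-bit-index gather intrinsics; the overloads taking
// src and k gather only where the mask selects and keep src elsewhere.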
  5505. #ifdef Vc_IMPL_AVX2
  5506. template <int Scale> __m128 gather(const float *addr, __m128i idx)
  5507. {
  5508. return _mm_i32gather_ps(addr, idx, Scale);
  5509. }
  5510. template <int Scale> __m128d gather(const double *addr, __m128i idx)
  5511. {
  5512. return _mm_i32gather_pd(addr, idx, Scale);
  5513. }
  5514. template <int Scale> __m128i gather(const int *addr, __m128i idx)
  5515. {
  5516. return _mm_i32gather_epi32(addr, idx, Scale);
  5517. }
  5518. template <int Scale> __m128i gather(const unsigned *addr, __m128i idx)
  5519. {
  5520. return _mm_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
  5521. }
  5522. template <int Scale> __m128 gather(__m128 src, __m128 k, const float *addr, __m128i idx)
  5523. {
  5524. return _mm_mask_i32gather_ps(src, addr, idx, k, Scale);
  5525. }
  5526. template <int Scale>
  5527. __m128d gather(__m128d src, __m128d k, const double *addr, __m128i idx)
  5528. {
  5529. return _mm_mask_i32gather_pd(src, addr, idx, k, Scale);
  5530. }
  5531. template <int Scale> __m128i gather(__m128i src, __m128i k, const int *addr, __m128i idx)
  5532. {
  5533. return _mm_mask_i32gather_epi32(src, addr, idx, k, Scale);
  5534. }
  5535. template <int Scale>
  5536. __m128i gather(__m128i src, __m128i k, const unsigned *addr, __m128i idx)
  5537. {
  5538. return _mm_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
  5539. }
  5540. #endif
  5541. }
  5542. }
  5543. namespace Vc_VERSIONED_NAMESPACE
  5544. {
  5545. namespace SSE
  5546. {
  5547. using namespace SseIntrinsics;
  5548. template <typename T> struct ParameterHelper
  5549. {
  5550. typedef T ByValue;
  5551. typedef T &Reference;
  5552. typedef const T &ConstRef;
  5553. };
  5554. template <typename T> struct VectorHelper
  5555. {
  5556. };
  5557. template <typename T> struct VectorTypeHelper
  5558. {
  5559. typedef __m128i Type;
  5560. };
  5561. template <> struct VectorTypeHelper<double>
  5562. {
  5563. typedef __m128d Type;
  5564. };
  5565. template <> struct VectorTypeHelper<float>
  5566. {
  5567. typedef __m128 Type;
  5568. };
  5569. template <typename T> struct DetermineGatherMask
  5570. {
  5571. typedef T Type;
  5572. };
  5573. template <typename T> struct VectorTraits
  5574. {
  5575. typedef typename VectorTypeHelper<T>::Type VectorType;
  5576. using EntryType = T;
  5577. static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
  5578. typedef Mask<T> MaskType;
  5579. typedef typename DetermineGatherMask<MaskType>::Type GatherMaskType;
  5580. typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
  5581. };
  5582. template <typename T> struct VectorHelperSize;
  5583. }
  5584. }
  5585. #if defined(Vc_GCC) && !defined(__OPTIMIZE__)
  5586. #pragma GCC diagnostic pop
  5587. #endif
  5588. #ifndef VC_SSE_SHUFFLE_H_
  5589. #define VC_SSE_SHUFFLE_H_
  5590. namespace Vc_VERSIONED_NAMESPACE
  5591. {
  5592. enum VecPos {
  5593. X0, X1, X2, X3, X4, X5, X6, X7,
  5594. Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7,
  5595. Const0
  5596. };
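// X0..X7 name lanes of the first operand, Y0..Y7 lanes of the second; the shuffle and
// blend templates below turn these positions into the corresponding immediates.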
  5597. namespace Mem
  5598. {
  5599. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
  5600. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
  5601. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
  5602. return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
  5603. }
  5604. template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
  5605. static_assert(Dst0 >= X0 && Dst1 >= Y0, "Incorrect_Range");
  5606. static_assert(Dst0 <= X1 && Dst1 <= Y1, "Incorrect_Range");
  5607. return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2);
  5608. }
  5609. template <VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3>
  5610. Vc_INTRINSIC Vc_CONST __m128i shuffle(__m128i x, __m128i y)
  5611. {
  5612. return _mm_castps_si128(shuffle<Dst0, Dst1, Dst2, Dst3>(_mm_castsi128_ps(x),
  5613. _mm_castsi128_ps(y)));
  5614. }
  5615. template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
  5616. static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
  5617. static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
  5618. return Vc::SseIntrinsics::blend_pd<(Dst0 / Y0) + (Dst1 / Y0) * 2>(x, y);
  5619. }
  5620. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
  5621. static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
  5622. static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
  5623. static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
  5624. static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
  5625. return Vc::SseIntrinsics::blend_ps<(Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
  5626. (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8>(x, y);
  5627. }
  5628. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
  5629. static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) {
  5630. static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
  5631. static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
  5632. static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
  5633. static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
  5634. static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
  5635. static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
  5636. static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
  5637. static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
  5638. return Vc::SseIntrinsics::blend_epi16<
  5639. (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
  5640. (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + (Dst6 / Y6) * 64 +
  5641. (Dst7 / Y7) * 128>(x, y);
  5642. }
  5643. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
  5644. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
  5645. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
  5646. return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  5647. }
  5648. template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE Vc_CONST __m128d permute(__m128d x) {
  5649. static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
  5650. static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
5651. return _mm_shuffle_pd(x, x, Dst0 + Dst1 * 2);
  5652. }
  5653. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
  5654. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
  5655. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
  5656. return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  5657. }
  5658. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) {
  5659. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
  5660. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
  5661. return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  5662. }
  5663. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) {
  5664. static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
  5665. static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
  5666. return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
  5667. }
  5668. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
  5669. static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
  5670. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
  5671. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
  5672. static_assert(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, "Incorrect_Range");
  5673. static_assert(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, "Incorrect_Range");
  5674. if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) {
  5675. x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  5676. }
  5677. if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) {
  5678. x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
  5679. }
  5680. return x;
  5681. }
  5682. }
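// The Reg variants take the destination positions in reversed (register) order and
// forward to the Mem variants above.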
  5683. namespace Reg
  5684. {
  5685. template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
  5686. return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y);
  5687. }
  5688. template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
  5689. return Mem::shuffle<Dst0, Dst1>(x, y);
  5690. }
  5691. template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
  5692. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
  5693. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
  5694. return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  5695. }
  5696. template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) {
  5697. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
  5698. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
  5699. return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64));
  5700. }
  5701. template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
  5702. return Mem::blend<Dst0, Dst1>(x, y);
  5703. }
  5704. template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
  5705. return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y);
  5706. }
  5707. }
  5708. }
  5709. #endif
  5710. #endif
  5711. #ifndef VC_SSE_VECTORHELPER_H_
  5712. #define VC_SSE_VECTORHELPER_H_
  5713. #include <limits>
  5714. namespace Vc_VERSIONED_NAMESPACE
  5715. {
  5716. namespace SSE
  5717. {
  5718. #define Vc_OP0(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
  5719. #define Vc_OP1(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; }
  5720. #define Vc_OP2(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; }
  5721. #define Vc_OP3(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; }
  5722. template<> struct VectorHelper<__m128>
  5723. {
  5724. typedef __m128 VectorType;
  5725. template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_ps(x); }
  5726. template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_ps(x); }
  5727. template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
  5728. template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_ps(mem, x); }
  5729. template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_ps(mem, x); }
  5730. template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_ps(mem, x); }
  5731. template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
  5732. template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast<char *>(mem)); }
  5733. Vc_OP0(allone, _mm_setallone_ps())
  5734. Vc_OP0(zero, _mm_setzero_ps())
  5735. Vc_OP3(blend, blendv_ps(a, b, c))
  5736. };
  5737. template<> struct VectorHelper<__m128d>
  5738. {
  5739. typedef __m128d VectorType;
  5740. template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_pd(x); }
  5741. template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_pd(x); }
  5742. template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
  5743. template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_pd(mem, x); }
  5744. template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_pd(mem, x); }
  5745. template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_pd(mem, x); }
  5746. template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
  5747. template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast<char *>(mem)); }
  5748. Vc_OP0(allone, _mm_setallone_pd())
  5749. Vc_OP0(zero, _mm_setzero_pd())
  5750. Vc_OP3(blend, blendv_pd(a, b, c))
  5751. };
  5752. template<> struct VectorHelper<__m128i>
  5753. {
  5754. typedef __m128i VectorType;
  5755. template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_si128(reinterpret_cast<const VectorType *>(x)); }
  5756. template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_si128(reinterpret_cast<const VectorType *>(x)); }
  5757. template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
  5758. template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_si128(reinterpret_cast<VectorType *>(mem), x); }
  5759. template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_si128(reinterpret_cast<VectorType *>(mem), x); }
  5760. template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_si128(reinterpret_cast<VectorType *>(mem), x); }
  5761. template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
  5762. template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem)); }
  5763. Vc_OP0(allone, _mm_setallone_si128())
  5764. Vc_OP0(zero, _mm_setzero_si128())
  5765. Vc_OP3(blend, blendv_epi8(a, b, c))
  5766. };
  5767. #undef Vc_OP1
  5768. #undef Vc_OP2
  5769. #undef Vc_OP3
  5770. #define Vc_OP1(op) \
  5771. static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return Vc_CAT2(_mm_##op##_, Vc_SUFFIX)(a); }
  5772. #define Vc_OP(op) \
  5773. static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op##_ , Vc_SUFFIX)(a, b); }
  5774. #define Vc_OP_(op) \
  5775. static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op , Vc_SUFFIX)(a, b); }
  5776. #define Vc_OPx(op,op2) \
  5777. static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op2##_, Vc_SUFFIX)(a, b); }
  5778. #define Vc_OP_CAST_(op) \
  5779. static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_castps_, Vc_SUFFIX)( \
  5780. _mm_##op##ps(Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(a), \
  5781. Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(b))); \
  5782. }
  5783. #define Vc_MINMAX \
  5784. static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return Vc_CAT2(_mm_min_, Vc_SUFFIX)(a, b); } \
  5785. static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return Vc_CAT2(_mm_max_, Vc_SUFFIX)(a, b); }
  5786. template<> struct VectorHelper<double> {
  5787. typedef __m128d VectorType;
  5788. typedef double EntryType;
  5789. #define Vc_SUFFIX pd
  5790. Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
  5791. static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_pd(mask), a); }
  5792. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
  5793. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b); }
  5794. static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
  5795. static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
  5796. #ifdef Vc_IMPL_FMA4
  5797. static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
  5798. v1 = _mm_macc_pd(v1, v2, v3);
  5799. }
  5800. #else
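// Fallback FMA: split each factor into high and low halves (highMaskDouble keeps the
// upper mantissa bits) so the partial products are exact, then sum them with v3 in an
// order chosen to limit the rounding error.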
  5801. static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
  5802. VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
  5803. VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
  5804. #if defined(Vc_GCC) && Vc_GCC < 0x40703
  5805. asm("":"+x"(h1), "+x"(h2));
  5806. #endif
  5807. const VectorType l1 = _mm_sub_pd(v1, h1);
  5808. const VectorType l2 = _mm_sub_pd(v2, h2);
  5809. const VectorType ll = mul(l1, l2);
  5810. const VectorType lh = add(mul(l1, h2), mul(h1, l2));
  5811. const VectorType hh = mul(h1, h2);
  5812. const VectorType lh_lt_v3 = _mm_cmplt_pd(abs(lh), abs(v3));
  5813. const VectorType b = blendv_pd(v3, lh, lh_lt_v3);
  5814. const VectorType c = blendv_pd(lh, v3, lh_lt_v3);
  5815. v1 = add(add(ll, b), add(c, hh));
  5816. }
  5817. #endif
  5818. Vc_OP(add) Vc_OP(sub) Vc_OP(mul)
  5819. Vc_OP1(sqrt)
  5820. static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) {
  5821. return _mm_div_pd(one(), sqrt(x));
  5822. }
  5823. static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
  5824. return _mm_div_pd(one(), x);
  5825. }
  5826. static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
  5827. return _mm_cmpunord_pd(x, x);
  5828. }
  5829. static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
  5830. return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x));
  5831. }
  5832. static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
  5833. return _mm_castsi128_pd(cmpeq_epi64(_mm_castpd_si128(abs(x)), _mm_castpd_si128(_mm_load_pd(c_log<double>::d(1)))));
  5834. }
  5835. static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
  5836. return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_pd());
  5837. }
  5838. Vc_MINMAX
  5839. static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
  5840. a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
  5841. return _mm_cvtsd_f64(a);
  5842. }
  5843. static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
  5844. a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
  5845. return _mm_cvtsd_f64(a);
  5846. }
  5847. static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
  5848. a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
  5849. return _mm_cvtsd_f64(a);
  5850. }
  5851. static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
  5852. a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
  5853. return _mm_cvtsd_f64(a);
  5854. }
  5855. #undef Vc_SUFFIX
  5856. static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
  5857. #ifdef Vc_IMPL_SSE4_1
  5858. return _mm_round_pd(a, _MM_FROUND_NINT);
  5859. #else
  5860. return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
  5861. #endif
  5862. }
  5863. };
  5864. template<> struct VectorHelper<float> {
  5865. typedef float EntryType;
  5866. typedef __m128 VectorType;
  5867. #define Vc_SUFFIX ps
  5868. Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
  5869. static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(mask, a); }
  5870. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
  5871. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
  5872. static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
  5873. static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
  5874. static Vc_ALWAYS_INLINE Vc_CONST __m128 concat(__m128d a, __m128d b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }
  5875. #ifdef Vc_IMPL_FMA4
  5876. static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
  5877. v1 = _mm_macc_ps(v1, v2, v3);
  5878. }
  5879. #else
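// Fallback FMA for float: widen to double so the products are exact, add, and narrow
// back to float.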
  5880. static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
  5881. __m128d v1_0 = _mm_cvtps_pd(v1);
  5882. __m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1));
  5883. __m128d v2_0 = _mm_cvtps_pd(v2);
  5884. __m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2));
  5885. __m128d v3_0 = _mm_cvtps_pd(v3);
  5886. __m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3));
  5887. v1 = _mm_movelh_ps(
  5888. _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)),
  5889. _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1)));
  5890. }
  5891. #endif
  5892. Vc_OP(add) Vc_OP(sub) Vc_OP(mul)
  5893. Vc_OP1(sqrt) Vc_OP1(rsqrt)
  5894. static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
  5895. return _mm_cmpunord_ps(x, x);
  5896. }
  5897. static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
  5898. return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));
  5899. }
  5900. static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
  5901. return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(abs(x)), _mm_castps_si128(_mm_load_ps(c_log<float>::d(1)))));
  5902. }
  5903. static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
  5904. return _mm_rcp_ps(x);
  5905. }
  5906. static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
  5907. return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_ps());
  5908. }
  5909. Vc_MINMAX
  5910. static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
  5911. a = _mm_min_ps(a, _mm_movehl_ps(a, a));
  5912. a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
  5913. return _mm_cvtss_f32(a);
  5914. }
  5915. static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
  5916. a = _mm_max_ps(a, _mm_movehl_ps(a, a));
  5917. a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
  5918. return _mm_cvtss_f32(a);
  5919. }
  5920. static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
  5921. a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
  5922. a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
  5923. return _mm_cvtss_f32(a);
  5924. }
  5925. static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
  5926. a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
  5927. a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
  5928. return _mm_cvtss_f32(a);
  5929. }
  5930. #undef Vc_SUFFIX
  5931. static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
  5932. #ifdef Vc_IMPL_SSE4_1
  5933. return _mm_round_ps(a, _MM_FROUND_NINT);
  5934. #else
  5935. return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
  5936. #endif
  5937. }
  5938. };
  5939. template<> struct VectorHelper<int> {
  5940. typedef int EntryType;
  5941. typedef __m128i VectorType;
  5942. #define Vc_SUFFIX si128
  5943. Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
  5944. static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
  5945. static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
  5946. #undef Vc_SUFFIX
  5947. #define Vc_SUFFIX epi32
  5948. static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
  5949. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
  5950. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
  5951. static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
  5952. static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
  5953. return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
  5954. }
  5955. static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
  5956. return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
  5957. }
  5958. static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi32(a); }
  5959. static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epi32(a, b); }
  5960. static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epi32(a, b); }
  5961. static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
  5962. a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  5963. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  5964. return _mm_cvtsi128_si32(a);
  5965. }
  5966. static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
  5967. a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  5968. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  5969. return _mm_cvtsi128_si32(a);
  5970. }
  5971. static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
  5972. a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  5973. a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  5974. return _mm_cvtsi128_si32(a);
  5975. }
  5976. #ifdef Vc_IMPL_SSE4_1
  5977. static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); }
  5978. static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
  5979. a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  5980. a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  5981. return _mm_cvtsi128_si32(a);
  5982. }
  5983. #else
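// SSE2 has no 32-bit mullo; multiply the even and odd lanes with _mm_mul_epu32 and
// interleave the low halves of the two results.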
  5984. static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
  5985. const VectorType aShift = _mm_srli_si128(a, 4);
  5986. const VectorType ab02 = _mm_mul_epu32(a, b);
  5987. const VectorType bShift = _mm_srli_si128(b, 4);
  5988. const VectorType ab13 = _mm_mul_epu32(aShift, bShift);
  5989. return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
  5990. }
  5991. #endif
  5992. Vc_OP(add) Vc_OP(sub)
  5993. #undef Vc_SUFFIX
  5994. static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
  5995. };
  5996. template<> struct VectorHelper<unsigned int> {
  5997. typedef unsigned int EntryType;
  5998. typedef __m128i VectorType;
  5999. #define Vc_SUFFIX si128
  6000. Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
  6001. static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
  6002. static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
  6003. #undef Vc_SUFFIX
  6004. #define Vc_SUFFIX epu32
  6005. static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
  6006. static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu32(a, b); }
  6007. static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu32(a, b); }
  6008. static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
  6009. a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6010. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6011. return _mm_cvtsi128_si32(a);
  6012. }
  6013. static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
  6014. a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6015. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6016. return _mm_cvtsi128_si32(a);
  6017. }
  6018. static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
  6019. a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6020. a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6021. return _mm_cvtsi128_si32(a);
  6022. }
  6023. static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
  6024. a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6025. a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6026. return _mm_cvtsi128_si32(a);
  6027. }
  6028. static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
  6029. static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
  6030. return VectorHelper<int>::mul(a, b);
  6031. }
  6032. #undef Vc_SUFFIX
  6033. #define Vc_SUFFIX epi32
  6034. static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
  6035. return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
  6036. }
  6037. static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
  6038. return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
  6039. }
  6040. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
  6041. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
  6042. Vc_OP(add) Vc_OP(sub)
  6043. #undef Vc_SUFFIX
  6044. static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
  6045. };
  6046. template<> struct VectorHelper<signed short> {
  6047. typedef __m128i VectorType;
  6048. typedef signed short EntryType;
  6049. #define Vc_SUFFIX si128
  6050. Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
  6051. static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
  6052. static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
  6053. static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); }
  6054. static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }
  6055. static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }
  6056. #undef Vc_SUFFIX
  6057. #define Vc_SUFFIX epi16
  6058. static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
  6059. static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
  6060. return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
  6061. }
  6062. static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
  6063. return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
  6064. }
  6065. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
  6066. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
  6067. const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
  6068. return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
  6069. }
  6070. static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
  6071. v1 = add(mul(v1, v2), v3); }
  6072. static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi16(a); }
  6073. Vc_OPx(mul, mullo)
  6074. Vc_OP(min) Vc_OP(max)
  6075. static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
  6076. a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6077. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6078. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
  6079. return _mm_cvtsi128_si32(a);
  6080. }
  6081. static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
  6082. a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6083. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6084. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
  6085. return _mm_cvtsi128_si32(a);
  6086. }
  6087. static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
  6088. a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6089. a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6090. a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
  6091. return _mm_cvtsi128_si32(a);
  6092. }
  6093. static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
  6094. a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6095. a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6096. a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
  6097. return _mm_cvtsi128_si32(a);
  6098. }
  6099. Vc_OP(add) Vc_OP(sub)
  6100. #undef Vc_SUFFIX
  6101. static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
  6102. };
  6103. template<> struct VectorHelper<unsigned short> {
  6104. typedef __m128i VectorType;
  6105. typedef unsigned short EntryType;
  6106. #define Vc_SUFFIX si128
  6107. Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
  6108. static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
  6109. static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
  6110. #ifdef Vc_IMPL_SSE4_1
  6111. static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); }
  6112. #else
  6113. static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) {
  6114. auto tmp0 = _mm_unpacklo_epi16(a, b);
  6115. auto tmp1 = _mm_unpackhi_epi16(a, b);
  6116. auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
  6117. auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
  6118. return _mm_unpacklo_epi16(tmp2, tmp3);
  6119. }
  6120. #endif
  6121. static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_unpacklo_epi16(x, _mm_setzero_si128()); }
  6122. static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); }
  6123. #undef Vc_SUFFIX
  6124. #define Vc_SUFFIX epu16
  6125. static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
6126. #if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || defined(Vc_IMPL_SSE4_1)
  6127. static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu16(a, b); }
  6128. static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu16(a, b); }
  6129. #endif
  6130. #undef Vc_SUFFIX
  6131. #define Vc_SUFFIX epi16
  6132. static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
  6133. return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
  6134. }
  6135. static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
  6136. return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
  6137. }
  6138. static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
  6139. Vc_OPx(mul, mullo)
  6140. #if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(Vc_IMPL_SSE4_1)
  6141. Vc_OP(min) Vc_OP(max)
  6142. #endif
  6143. static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
  6144. a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6145. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6146. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
  6147. return _mm_cvtsi128_si32(a);
  6148. }
  6149. static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
  6150. a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6151. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6152. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
  6153. return _mm_cvtsi128_si32(a);
  6154. }
  6155. static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
  6156. a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6157. a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6158. a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
  6159. return _mm_cvtsi128_si32(a);
  6160. }
  6161. static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
  6162. a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6163. a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
  6164. a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
  6165. return _mm_cvtsi128_si32(a);
  6166. }
  6167. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
  6168. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c,
  6169. const EntryType d, const EntryType e, const EntryType f,
  6170. const EntryType g, const EntryType h) {
  6171. return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
  6172. }
  6173. Vc_OP(add) Vc_OP(sub)
  6174. #undef Vc_SUFFIX
  6175. static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
  6176. };
  6177. #undef Vc_OP1
  6178. #undef Vc_OP
  6179. #undef Vc_OP_
  6180. #undef Vc_OPx
  6181. #undef Vc_OP_CAST_
  6182. #undef Vc_MINMAX
  6183. }
  6184. }
  6185. #endif
  6186. #ifndef VC_SSE_MASK_H_
  6187. #define VC_SSE_MASK_H_
  6188. #ifndef VC_SSE_DETAIL_H_
  6189. #define VC_SSE_DETAIL_H_
  6190. #ifndef VC_SSE_CASTS_H_
  6191. #define VC_SSE_CASTS_H_
  6192. namespace Vc_VERSIONED_NAMESPACE
  6193. {
  6194. namespace SSE
  6195. {
  6196. using uint = unsigned int;
  6197. using ushort = unsigned short;
  6198. using uchar = unsigned char;
  6199. using schar = signed char;
  6200. template <typename To, typename From> Vc_ALWAYS_INLINE Vc_CONST To sse_cast(From v)
  6201. {
  6202. return v;
  6203. }
  6204. template<> Vc_ALWAYS_INLINE Vc_CONST __m128i sse_cast<__m128i, __m128 >(__m128 v) { return _mm_castps_si128(v); }
  6205. template<> Vc_ALWAYS_INLINE Vc_CONST __m128i sse_cast<__m128i, __m128d>(__m128d v) { return _mm_castpd_si128(v); }
  6206. template<> Vc_ALWAYS_INLINE Vc_CONST __m128 sse_cast<__m128 , __m128d>(__m128d v) { return _mm_castpd_ps(v); }
  6207. template<> Vc_ALWAYS_INLINE Vc_CONST __m128 sse_cast<__m128 , __m128i>(__m128i v) { return _mm_castsi128_ps(v); }
  6208. template<> Vc_ALWAYS_INLINE Vc_CONST __m128d sse_cast<__m128d, __m128i>(__m128i v) { return _mm_castsi128_pd(v); }
  6209. template<> Vc_ALWAYS_INLINE Vc_CONST __m128d sse_cast<__m128d, __m128 >(__m128 v) { return _mm_castps_pd(v); }
  6210. template <typename From, typename To> struct ConvertTag
  6211. {
  6212. };
  6213. template <typename From, typename To>
  6214. Vc_INTRINSIC typename VectorTraits<To>::VectorType convert(
  6215. typename VectorTraits<From>::VectorType v)
  6216. {
  6217. return convert(v, ConvertTag<From, To>());
  6218. }
  6219. Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , int >) { return _mm_cvttps_epi32(v); }
  6220. Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, int >) { return _mm_cvttpd_epi32(v); }
  6221. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , int >) { return v; }
  6222. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , int >) { return v; }
  6223. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , int >) {
  6224. #ifdef Vc_IMPL_SSE4_1
  6225. return _mm_cvtepi16_epi32(v);
  6226. #else
  6227. return _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16);
  6228. #endif
  6229. }
  6230. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, int >) {
  6231. #ifdef Vc_IMPL_SSE4_1
  6232. return _mm_cvtepu16_epi32(v);
  6233. #else
  6234. return _mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16);
  6235. #endif
  6236. }
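// float -> uint: inputs >= 2^31 overflow the signed cvttps conversion, so those lanes
// convert v - 2^31 instead and restore the high bit with an XOR, selected by the >=
// comparison.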
  6237. Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , uint >) {
  6238. return _mm_castps_si128(
  6239. blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(v)),
  6240. _mm_castsi128_ps(_mm_xor_si128(
  6241. _mm_cvttps_epi32(_mm_sub_ps(v, _mm_set1_ps(1u << 31))),
  6242. _mm_set1_epi32(1 << 31))),
  6243. _mm_cmpge_ps(v, _mm_set1_ps(1u << 31))));
  6244. }
  6245. Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, uint >) {
  6246. #ifdef Vc_IMPL_SSE4_1
  6247. return _mm_xor_si128(_mm_cvttpd_epi32(_mm_sub_pd(_mm_floor_pd(v), _mm_set1_pd(0x80000000u))),
  6248. _mm_cvtsi64_si128(0x8000000080000000ull));
  6249. #else
  6250. return blendv_epi8(_mm_cvttpd_epi32(v),
  6251. _mm_xor_si128(_mm_cvttpd_epi32(_mm_sub_pd(v, _mm_set1_pd(0x80000000u))),
  6252. _mm_cvtsi64_si128(0x8000000080000000ull)),
  6253. _mm_castpd_si128(_mm_cmpge_pd(v, _mm_set1_pd(0x80000000u))));
  6254. #endif
  6255. }
  6256. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , uint >) { return v; }
  6257. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , uint >) { return v; }
  6258. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , uint >) { return convert(v, ConvertTag<short, int>()); }
  6259. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, uint >) { return convert(v, ConvertTag<ushort, int>()); }
  6260. Vc_INTRINSIC __m128 convert(__m128 v, ConvertTag<float , float >) { return v; }
  6261. Vc_INTRINSIC __m128 convert(__m128d v, ConvertTag<double, float >) { return _mm_cvtpd_ps(v); }
  6262. Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<int , float >) { return _mm_cvtepi32_ps(v); }
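// uint -> float: lanes with the sign bit set would convert incorrectly as signed ints;
// they are rebuilt as float(v & 0x7ffffe00) + (2^31 + float(v & 0x1ff)), converting the
// high and low bits separately.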
  6263. Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<uint , float >) {
  6264. using namespace SSE;
  6265. return blendv_ps(_mm_cvtepi32_ps(v),
  6266. _mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(v, _mm_set1_epi32(0x7ffffe00))),
  6267. _mm_add_ps(_mm_set1_ps(1u << 31), _mm_cvtepi32_ps(_mm_and_si128(
  6268. v, _mm_set1_epi32(0x000001ff))))),
  6269. _mm_castsi128_ps(_mm_cmplt_epi32(v, _mm_setzero_si128())));
  6270. }
  6271. Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<short , float >) { return convert(convert(v, ConvertTag<short, int>()), ConvertTag<int, float>()); }
  6272. Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<ushort, float >) { return convert(convert(v, ConvertTag<ushort, int>()), ConvertTag<int, float>()); }
  6273. Vc_INTRINSIC __m128d convert(__m128 v, ConvertTag<float , double>) { return _mm_cvtps_pd(v); }
  6274. Vc_INTRINSIC __m128d convert(__m128d v, ConvertTag<double, double>) { return v; }
  6275. Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<int , double>) { return _mm_cvtepi32_pd(v); }
  6276. Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<uint , double>) { return _mm_add_pd(_mm_cvtepi32_pd(_mm_xor_si128(v, setmin_epi32())), _mm_set1_pd(1u << 31)); }
  6277. Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<short , double>) { return convert(convert(v, ConvertTag<short, int>()), ConvertTag<int, double>()); }
  6278. Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<ushort, double>) { return convert(convert(v, ConvertTag<ushort, int>()), ConvertTag<int, double>()); }
  6279. Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , short >) { return _mm_packs_epi32(_mm_cvttps_epi32(v), _mm_setzero_si128()); }
  6280. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , short >) { return _mm_packs_epi32(v, _mm_setzero_si128()); }
  6281. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , short >) { return _mm_packs_epi32(v, _mm_setzero_si128()); }
  6282. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , short >) { return v; }
  6283. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, short >) { return v; }
  6284. Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, short >) { return convert(convert(v, ConvertTag<double, int>()), ConvertTag<int, short>()); }
  6285. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , ushort>) {
  6286. auto tmp0 = _mm_unpacklo_epi16(v, _mm_setzero_si128());
  6287. auto tmp1 = _mm_unpackhi_epi16(v, _mm_setzero_si128());
  6288. auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
  6289. auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
  6290. return _mm_unpacklo_epi16(tmp2, tmp3);
  6291. }
  6292. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , ushort>) {
  6293. auto tmp0 = _mm_unpacklo_epi16(v, _mm_setzero_si128());
  6294. auto tmp1 = _mm_unpackhi_epi16(v, _mm_setzero_si128());
  6295. auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
  6296. auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
  6297. return _mm_unpacklo_epi16(tmp2, tmp3);
  6298. }
  6299. Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , ushort>) { return convert(_mm_cvttps_epi32(v), ConvertTag<int, ushort>()); }
  6300. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , ushort>) { return v; }
  6301. Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, ushort>) { return v; }
  6302. Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, ushort>) { return convert(convert(v, ConvertTag<double, int>()), ConvertTag<int, ushort>()); }
  6303. }
  6304. }
  6305. #endif
  6306. #ifdef Vc_IMPL_AVX
  6307. #endif
  6308. namespace Vc_VERSIONED_NAMESPACE
  6309. {
  6310. namespace Detail
  6311. {
  6312. template <typename V, typename DstT> struct LoadTag
  6313. {
  6314. };
  6315. class when_aligned
  6316. {
  6317. public:
  6318. template <typename F> constexpr when_aligned(F, typename F::EnableIfAligned = nullptr)
  6319. {
  6320. }
  6321. };
  6322. class when_unaligned
  6323. {
  6324. public:
  6325. template <typename F>
  6326. constexpr when_unaligned(F, typename F::EnableIfUnaligned = nullptr)
  6327. {
  6328. }
  6329. };
  6330. class when_streaming
  6331. {
  6332. public:
  6333. template <typename F>
  6334. constexpr when_streaming(F, typename F::EnableIfStreaming = nullptr)
  6335. {
  6336. }
  6337. };
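// Tag classes constructible only from flag types that provide the matching EnableIf
// typedef; the load16 overloads below dispatch on them to pick aligned, unaligned, or
// streaming loads.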
  6338. Vc_INTRINSIC __m128 load16(const float *mem, when_aligned)
  6339. {
  6340. return _mm_load_ps(mem);
  6341. }
  6342. Vc_INTRINSIC __m128 load16(const float *mem, when_unaligned)
  6343. {
  6344. return _mm_loadu_ps(mem);
  6345. }
  6346. Vc_INTRINSIC __m128 load16(const float *mem, when_streaming)
  6347. {
  6348. return SseIntrinsics::_mm_stream_load(mem);
  6349. }
  6350. Vc_INTRINSIC __m128d load16(const double *mem, when_aligned)
  6351. {
  6352. return _mm_load_pd(mem);
  6353. }
  6354. Vc_INTRINSIC __m128d load16(const double *mem, when_unaligned)
  6355. {
  6356. return _mm_loadu_pd(mem);
  6357. }
  6358. Vc_INTRINSIC __m128d load16(const double *mem, when_streaming)
  6359. {
  6360. return SseIntrinsics::_mm_stream_load(mem);
  6361. }
  6362. template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_aligned)
  6363. {
  6364. static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
  6365. return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
  6366. }
  6367. template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_unaligned)
  6368. {
  6369. static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
  6370. return _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem));
  6371. }
  6372. template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_streaming)
  6373. {
  6374. static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
  6375. return SseIntrinsics::_mm_stream_load(mem);
  6376. }
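// Under Vc_MSVC the generic converting load template further below excludes
// the DstT == SrcT case, so these explicit overloads supply the plain
// (non-converting) 16-byte loads for that compiler.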
  6377. #ifdef Vc_MSVC
  6378. template <typename V, typename DstT, typename F>
  6379. Vc_INTRINSIC __m128d load(const double *mem, F f,
  6380. enable_if<(std::is_same<DstT, double>::value &&
  6381. std::is_same<V, __m128d>::value)> = nullarg)
  6382. {
  6383. return load16(mem, f);
  6384. }
  6385. template <typename V, typename DstT, typename F>
  6386. Vc_INTRINSIC __m128 load(const float *mem, F f,
  6387. enable_if<(std::is_same<DstT, float>::value &&
  6388. std::is_same<V, __m128>::value)> = nullarg)
  6389. {
  6390. return load16(mem, f);
  6391. }
  6392. template <typename V, typename DstT, typename F>
  6393. Vc_INTRINSIC __m128i load(const uint *mem, F f,
  6394. enable_if<(std::is_same<DstT, uint>::value &&
  6395. std::is_same<V, __m128i>::value)> = nullarg)
  6396. {
  6397. return load16(mem, f);
  6398. }
  6399. template <typename V, typename DstT, typename F>
  6400. Vc_INTRINSIC __m128i load(const int *mem, F f,
  6401. enable_if<(std::is_same<DstT, int>::value &&
  6402. std::is_same<V, __m128i>::value)> = nullarg)
  6403. {
  6404. return load16(mem, f);
  6405. }
  6406. template <typename V, typename DstT, typename F>
  6407. Vc_INTRINSIC __m128i load(const short *mem, F f,
  6408. enable_if<(std::is_same<DstT, short>::value &&
  6409. std::is_same<V, __m128i>::value)> = nullarg)
  6410. {
  6411. return load16(mem, f);
  6412. }
  6413. template <typename V, typename DstT, typename F>
  6414. Vc_INTRINSIC __m128i load(const ushort *mem, F f,
  6415. enable_if<(std::is_same<DstT, ushort>::value &&
  6416. std::is_same<V, __m128i>::value)> = nullarg)
  6417. {
  6418. return load16(mem, f);
  6419. }
  6420. #endif
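// Generic entry point: load<V, DstT>(mem, flags) forwards to a LoadTag
// overload, so converting loads (SrcT != DstT) pick the matching widening or
// conversion sequence defined below.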
  6421. template <typename V, typename DstT, typename SrcT, typename Flags,
  6422. typename = enable_if<
  6423. #ifdef Vc_MSVC
  6424. !std::is_same<DstT, SrcT>::value &&
  6425. #endif
  6426. (!std::is_integral<DstT>::value || !std::is_integral<SrcT>::value ||
  6427. sizeof(DstT) >= sizeof(SrcT))>>
  6428. Vc_INTRINSIC V load(const SrcT *mem, Flags flags)
  6429. {
  6430. return load(mem, flags, LoadTag<V, DstT>());
  6431. }
  6432. template <typename V, typename T, typename Flags>
  6433. Vc_INTRINSIC V
  6434. load(const T *mem, Flags, LoadTag<V, T>, enable_if<sizeof(V) == 16> = nullarg)
  6435. {
  6436. return SSE::VectorHelper<V>::template load<Flags>(mem);
  6437. }
  6438. template <typename Flags>
  6439. Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, short>)
  6440. {
  6441. return SSE::VectorHelper<__m128i>::load<Flags>(mem);
  6442. }
  6443. template <typename Flags>
  6444. Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, short>)
  6445. {
  6446. return SSE::cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
  6447. }
  6448. template <typename Flags>
  6449. Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, short>)
  6450. {
  6451. return SSE::cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
  6452. }
  6453. template <typename Flags>
  6454. Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, ushort>)
  6455. {
  6456. return SSE::cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
  6457. }
  6458. template <typename Flags>
  6459. Vc_INTRINSIC __m128i load(const uint *mem, Flags, LoadTag<__m128i, int>)
  6460. {
  6461. return SSE::VectorHelper<__m128i>::load<Flags>(mem);
  6462. }
  6463. template <typename Flags>
  6464. Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, int>)
  6465. {
  6466. return SSE::cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
  6467. }
  6468. template <typename Flags>
  6469. Vc_INTRINSIC __m128i load(const short *mem, Flags, LoadTag<__m128i, int>)
  6470. {
  6471. return SSE::cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
  6472. }
  6473. template <typename Flags>
  6474. Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, int>)
  6475. {
  6476. return SSE::cvtepu8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
  6477. }
  6478. template <typename Flags>
  6479. Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, int>)
  6480. {
  6481. return SSE::cvtepi8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
  6482. }
  6483. template <typename Flags>
  6484. Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, uint>)
  6485. {
  6486. return SSE::cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
  6487. }
  6488. template <typename Flags>
  6489. Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, uint>)
  6490. {
  6491. return SSE::cvtepu8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
  6492. }
  6493. template <typename Flags>
  6494. Vc_INTRINSIC __m128d load(const float *mem, Flags, LoadTag<__m128d, double>)
  6495. {
  6496. return SSE::convert<float, double>(
  6497. _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64 *>(mem)));
  6498. }
  6499. template <typename Flags>
  6500. Vc_INTRINSIC __m128d load(const uint *mem, Flags, LoadTag<__m128d, double>)
  6501. {
  6502. return SSE::convert<uint, double>(
  6503. _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
  6504. }
  6505. template <typename Flags>
  6506. Vc_INTRINSIC __m128d load(const int *mem, Flags, LoadTag<__m128d, double>)
  6507. {
  6508. return SSE::convert<int, double>(
  6509. _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
  6510. }
  6511. template <typename Flags>
  6512. Vc_INTRINSIC __m128d load(const ushort *mem, Flags, LoadTag<__m128d, double>)
  6513. {
  6514. return SSE::convert<ushort, double>(
  6515. _mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
  6516. }
  6517. template <typename Flags>
  6518. Vc_INTRINSIC __m128d load(const short *mem, Flags, LoadTag<__m128d, double>)
  6519. {
  6520. return SSE::convert<short, double>(
  6521. _mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
  6522. }
  6523. template <typename Flags>
  6524. Vc_INTRINSIC __m128d load(const uchar *mem, Flags, LoadTag<__m128d, double>)
  6525. {
  6526. return SSE::convert<uchar, double>(
  6527. _mm_set1_epi16(*aliasing_cast<short>(mem)));
  6528. }
  6529. template <typename Flags>
  6530. Vc_INTRINSIC __m128d load(const schar *mem, Flags, LoadTag<__m128d, double>)
  6531. {
  6532. return SSE::convert<char, double>(
  6533. _mm_set1_epi16(*aliasing_cast<short>(mem)));
  6534. }
  6535. template <typename Flags>
  6536. Vc_INTRINSIC __m128 load(const double *mem, Flags, LoadTag<__m128, float>)
  6537. {
  6538. #ifdef Vc_IMPL_AVX
  6539. if (Flags::IsUnaligned) {
  6540. return _mm256_cvtpd_ps(_mm256_loadu_pd(mem));
  6541. } else if (Flags::IsStreaming) {
  6542. return _mm256_cvtpd_ps(AvxIntrinsics::stream_load<__m256d>(mem));
  6543. } else {
  6544. return _mm256_cvtpd_ps(_mm256_load_pd(mem));
  6545. }
  6546. #else
  6547. return _mm_movelh_ps(_mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load<Flags>(&mem[0])),
  6548. _mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load<Flags>(&mem[2])));
  6549. #endif
  6550. }
  6551. template <typename Flags>
  6552. Vc_INTRINSIC __m128 load(const uint *mem, Flags f, LoadTag<__m128, float>)
  6553. {
  6554. return SSE::convert<uint, float>(load<__m128i, uint>(mem, f));
  6555. }
  6556. template <typename T, typename Flags,
  6557. typename = enable_if<!std::is_same<T, float>::value>>
  6558. Vc_INTRINSIC __m128 load(const T *mem, Flags f, LoadTag<__m128, float>)
  6559. {
  6560. return _mm_cvtepi32_ps(load<__m128i, int>(mem, f));
  6561. }
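// shifted<amount>: shifts the full 16-byte register by `amount` bytes,
// to the right for positive amounts and to the left for negative ones.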
  6562. template <int amount, typename T>
  6563. Vc_INTRINSIC Vc_CONST enable_if<amount == 0, T> shifted(T k)
  6564. {
  6565. return k;
  6566. }
  6567. template <int amount, typename T>
  6568. Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount > 0), T> shifted(T k)
  6569. {
  6570. return _mm_srli_si128(k, amount);
  6571. }
  6572. template <int amount, typename T>
  6573. Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount < 0), T> shifted(T k)
  6574. {
  6575. return _mm_slli_si128(k, -amount);
  6576. }
  6577. template <typename T, int Size> Vc_INTRINSIC Vc_CONST const T *IndexesFromZero()
  6578. {
  6579. if (Size == 4) {
  6580. return reinterpret_cast<const T *>(SSE::_IndexesFromZero4);
  6581. } else if (Size == 8) {
  6582. return reinterpret_cast<const T *>(SSE::_IndexesFromZero8);
  6583. } else if (Size == 16) {
  6584. return reinterpret_cast<const T *>(SSE::_IndexesFromZero16);
  6585. }
  6586. return 0;
  6587. }
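// Population count for 4-, 8-, 16-, and 32-bit mask values. With
// Vc_IMPL_POPCNT this maps to the POPCNT instruction; otherwise a SWAR
// bit-counting fallback is used.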
  6588. Vc_INTRINSIC Vc_CONST unsigned int popcnt4(unsigned int n)
  6589. {
  6590. #ifdef Vc_IMPL_POPCNT
  6591. return _mm_popcnt_u32(n);
  6592. #else
  6593. n = (n & 0x5U) + ((n >> 1) & 0x5U);
  6594. n = (n & 0x3U) + ((n >> 2) & 0x3U);
  6595. return n;
  6596. #endif
  6597. }
  6598. Vc_INTRINSIC Vc_CONST unsigned int popcnt8(unsigned int n)
  6599. {
  6600. #ifdef Vc_IMPL_POPCNT
  6601. return _mm_popcnt_u32(n);
  6602. #else
  6603. n = (n & 0x55U) + ((n >> 1) & 0x55U);
  6604. n = (n & 0x33U) + ((n >> 2) & 0x33U);
  6605. n = (n & 0x0fU) + ((n >> 4) & 0x0fU);
  6606. return n;
  6607. #endif
  6608. }
  6609. Vc_INTRINSIC Vc_CONST unsigned int popcnt16(unsigned int n)
  6610. {
  6611. #ifdef Vc_IMPL_POPCNT
  6612. return _mm_popcnt_u32(n);
  6613. #else
  6614. n = (n & 0x5555U) + ((n >> 1) & 0x5555U);
  6615. n = (n & 0x3333U) + ((n >> 2) & 0x3333U);
  6616. n = (n & 0x0f0fU) + ((n >> 4) & 0x0f0fU);
  6617. n = (n & 0x00ffU) + ((n >> 8) & 0x00ffU);
  6618. return n;
  6619. #endif
  6620. }
  6621. Vc_INTRINSIC Vc_CONST unsigned int popcnt32(unsigned int n)
  6622. {
  6623. #ifdef Vc_IMPL_POPCNT
  6624. return _mm_popcnt_u32(n);
  6625. #else
  6626. n = (n & 0x55555555U) + ((n >> 1) & 0x55555555U);
  6627. n = (n & 0x33333333U) + ((n >> 2) & 0x33333333U);
  6628. n = (n & 0x0f0f0f0fU) + ((n >> 4) & 0x0f0f0f0fU);
  6629. n = (n & 0x00ff00ffU) + ((n >> 8) & 0x00ff00ffU);
  6630. n = (n & 0x0000ffffU) + ((n >>16) & 0x0000ffffU);
  6631. return n;
  6632. #endif
  6633. }
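// mask_cast<From, To, R>: reinterprets an SSE mask of `From` entries as a
// mask of `To` entries by packing (narrowing) or unpacking (widening) the
// per-entry mask bits. The primary template covers the trivial From == To
// case; the specializations below handle the size-changing casts.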
  6634. template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m128i k)
  6635. {
  6636. static_assert(From == To, "Incorrect mask cast.");
  6637. static_assert(std::is_same<R, __m128>::value, "Incorrect mask cast.");
  6638. return SSE::sse_cast<__m128>(k);
  6639. }
  6640. template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 4, __m128>(__m128i k)
  6641. {
  6642. return SSE::sse_cast<__m128>(_mm_packs_epi16(k, _mm_setzero_si128()));
  6643. }
  6644. template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 8, __m128>(__m128i k)
  6645. {
  6646. return SSE::sse_cast<__m128>(
  6647. _mm_packs_epi16(_mm_packs_epi16(k, _mm_setzero_si128()), _mm_setzero_si128()));
  6648. }
  6649. template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 2, __m128>(__m128i k)
  6650. {
  6651. return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(k, k));
  6652. }
  6653. template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m128i k)
  6654. {
  6655. return SSE::sse_cast<__m128>(_mm_packs_epi16(k, _mm_setzero_si128()));
  6656. }
  6657. template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 2, __m128>(__m128i k)
  6658. {
  6659. const auto tmp = _mm_unpacklo_epi16(k, k);
  6660. return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(tmp, tmp));
  6661. }
  6662. template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m128i k)
  6663. {
  6664. return SSE::sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
  6665. }
  6666. template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 8, __m128>(__m128i k)
  6667. {
  6668. return SSE::sse_cast<__m128>(_mm_unpacklo_epi8(k, k));
  6669. }
  6670. template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 4, __m128>(__m128i k)
  6671. {
  6672. const auto tmp = SSE::sse_cast<__m128i>(mask_cast<16, 8, __m128>(k));
  6673. return SSE::sse_cast<__m128>(_mm_unpacklo_epi16(tmp, tmp));
  6674. }
  6675. template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 2, __m128>(__m128i k)
  6676. {
  6677. const auto tmp = SSE::sse_cast<__m128i>(mask_cast<16, 4, __m128>(k));
  6678. return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(tmp, tmp));
  6679. }
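// allone<V>() yields a register with every bit set, zero<V>() a zeroed
// register, for each of the three SSE register types.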
  6680. template <typename V> Vc_INTRINSIC_L Vc_CONST_L V allone() Vc_INTRINSIC_R Vc_CONST_R;
  6681. template<> Vc_INTRINSIC Vc_CONST __m128 allone<__m128 >() { return SSE::_mm_setallone_ps(); }
  6682. template<> Vc_INTRINSIC Vc_CONST __m128i allone<__m128i>() { return SSE::_mm_setallone_si128(); }
  6683. template<> Vc_INTRINSIC Vc_CONST __m128d allone<__m128d>() { return SSE::_mm_setallone_pd(); }
  6684. template <typename V> inline V zero();
  6685. template<> Vc_INTRINSIC Vc_CONST __m128 zero<__m128 >() { return _mm_setzero_ps(); }
  6686. template<> Vc_INTRINSIC Vc_CONST __m128i zero<__m128i>() { return _mm_setzero_si128(); }
  6687. template<> Vc_INTRINSIC Vc_CONST __m128d zero<__m128d>() { return _mm_setzero_pd(); }
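// negate: flips the sign of each element. The floating-point variants XOR
// the sign bit; the integer variants use SSSE3 _mm_sign_* where available
// and fall back to subtraction from zero otherwise.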
  6688. Vc_ALWAYS_INLINE Vc_CONST __m128 negate(__m128 v, std::integral_constant<std::size_t, 4>)
  6689. {
  6690. return _mm_xor_ps(v, SSE::_mm_setsignmask_ps());
  6691. }
  6692. Vc_ALWAYS_INLINE Vc_CONST __m128d negate(__m128d v, std::integral_constant<std::size_t, 8>)
  6693. {
  6694. return _mm_xor_pd(v, SSE::_mm_setsignmask_pd());
  6695. }
  6696. Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant<std::size_t, 4>)
  6697. {
  6698. #ifdef Vc_IMPL_SSSE3
  6699. return _mm_sign_epi32(v, allone<__m128i>());
  6700. #else
  6701. return _mm_sub_epi32(_mm_setzero_si128(), v);
  6702. #endif
  6703. }
  6704. Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant<std::size_t, 2>)
  6705. {
  6706. #ifdef Vc_IMPL_SSSE3
  6707. return _mm_sign_epi16(v, allone<__m128i>());
  6708. #else
  6709. return _mm_sub_epi16(_mm_setzero_si128(), v);
  6710. #endif
  6711. }
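// Thin wrappers over the SSE bitwise and arithmetic intrinsics. The trailing
// value parameter (float, int, short, ...) is an unused tag that selects the
// instruction for the corresponding element width.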
  6712. Vc_INTRINSIC __m128 xor_(__m128 a, __m128 b) { return _mm_xor_ps(a, b); }
  6713. Vc_INTRINSIC __m128d xor_(__m128d a, __m128d b) { return _mm_xor_pd(a, b); }
  6714. Vc_INTRINSIC __m128i xor_(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
  6715. Vc_INTRINSIC __m128 or_(__m128 a, __m128 b) { return _mm_or_ps(a, b); }
  6716. Vc_INTRINSIC __m128d or_(__m128d a, __m128d b) { return _mm_or_pd(a, b); }
  6717. Vc_INTRINSIC __m128i or_(__m128i a, __m128i b) { return _mm_or_si128(a, b); }
  6718. Vc_INTRINSIC __m128 and_(__m128 a, __m128 b) { return _mm_and_ps(a, b); }
  6719. Vc_INTRINSIC __m128d and_(__m128d a, __m128d b) { return _mm_and_pd(a, b); }
  6720. Vc_INTRINSIC __m128i and_(__m128i a, __m128i b) { return _mm_and_si128(a, b); }
  6721. Vc_INTRINSIC __m128 andnot_(__m128 a, __m128 b) { return _mm_andnot_ps(a, b); }
  6722. Vc_INTRINSIC __m128d andnot_(__m128d a, __m128d b) { return _mm_andnot_pd(a, b); }
  6723. Vc_INTRINSIC __m128i andnot_(__m128i a, __m128i b) { return _mm_andnot_si128(a, b); }
  6724. Vc_INTRINSIC __m128 not_(__m128 a) { return andnot_(a, allone<__m128 >()); }
  6725. Vc_INTRINSIC __m128d not_(__m128d a) { return andnot_(a, allone<__m128d>()); }
  6726. Vc_INTRINSIC __m128i not_(__m128i a) { return andnot_(a, allone<__m128i>()); }
  6727. Vc_INTRINSIC __m128 add(__m128 a, __m128 b, float) { return _mm_add_ps(a, b); }
  6728. Vc_INTRINSIC __m128d add(__m128d a, __m128d b, double) { return _mm_add_pd(a, b); }
  6729. Vc_INTRINSIC __m128i add(__m128i a, __m128i b, int) { return _mm_add_epi32(a, b); }
  6730. Vc_INTRINSIC __m128i add(__m128i a, __m128i b, uint) { return _mm_add_epi32(a, b); }
  6731. Vc_INTRINSIC __m128i add(__m128i a, __m128i b, short) { return _mm_add_epi16(a, b); }
  6732. Vc_INTRINSIC __m128i add(__m128i a, __m128i b, ushort) { return _mm_add_epi16(a, b); }
  6733. Vc_INTRINSIC __m128i add(__m128i a, __m128i b, schar) { return _mm_add_epi8 (a, b); }
  6734. Vc_INTRINSIC __m128i add(__m128i a, __m128i b, uchar) { return _mm_add_epi8 (a, b); }
  6735. Vc_INTRINSIC __m128 sub(__m128 a, __m128 b, float) { return _mm_sub_ps(a, b); }
  6736. Vc_INTRINSIC __m128d sub(__m128d a, __m128d b, double) { return _mm_sub_pd(a, b); }
  6737. Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, int) { return _mm_sub_epi32(a, b); }
  6738. Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, uint) { return _mm_sub_epi32(a, b); }
  6739. Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, short) { return _mm_sub_epi16(a, b); }
  6740. Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, ushort) { return _mm_sub_epi16(a, b); }
  6741. Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, schar) { return _mm_sub_epi8 (a, b); }
  6742. Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, uchar) { return _mm_sub_epi8 (a, b); }
  6743. Vc_INTRINSIC __m128 mul(__m128 a, __m128 b, float) { return _mm_mul_ps(a, b); }
  6744. Vc_INTRINSIC __m128d mul(__m128d a, __m128d b, double) { return _mm_mul_pd(a, b); }
  6745. Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, int) {
  6746. #ifdef Vc_IMPL_SSE4_1
  6747. return _mm_mullo_epi32(a, b);
  6748. #else
  6749. const __m128i aShift = _mm_srli_si128(a, 4);
  6750. const __m128i ab02 = _mm_mul_epu32(a, b);
  6751. const __m128i bShift = _mm_srli_si128(b, 4);
  6752. const __m128i ab13 = _mm_mul_epu32(aShift, bShift);
  6753. return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
  6754. #endif
  6755. }
  6756. Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, uint) { return mul(a, b, int()); }
  6757. Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, short) { return _mm_mullo_epi16(a, b); }
  6758. Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, ushort) { return _mm_mullo_epi16(a, b); }
  6759. Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, schar) {
  6760. #ifdef Vc_USE_BUILTIN_VECTOR_TYPES
  6761. using B = Common::BuiltinType<schar, 16>;
  6762. const auto x = aliasing_cast<B>(a) * aliasing_cast<B>(b);
  6763. return reinterpret_cast<const __m128i &>(x);
  6764. #else
  6765. return or_(
  6766. and_(_mm_mullo_epi16(a, b), _mm_slli_epi16(allone<__m128i>(), 8)),
  6767. _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8));
  6768. #endif
  6769. }
  6770. Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, uchar) {
  6771. #ifdef Vc_USE_BUILTIN_VECTOR_TYPES
  6772. using B = Common::BuiltinType<uchar, 16>;
  6773. const auto x = aliasing_cast<B>(a) * aliasing_cast<B>(b);
  6774. return reinterpret_cast<const __m128i &>(x);
  6775. #else
  6776. return or_(
  6777. and_(_mm_mullo_epi16(a, b), _mm_slli_epi16(allone<__m128i>(), 8)),
  6778. _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8));
  6779. #endif
  6780. }
  6781. Vc_INTRINSIC __m128 div(__m128 a, __m128 b, float) { return _mm_div_ps(a, b); }
  6782. Vc_INTRINSIC __m128d div(__m128d a, __m128d b, double) { return _mm_div_pd(a, b); }
  6783. Vc_INTRINSIC __m128 min(__m128 a, __m128 b, float) { return _mm_min_ps(a, b); }
  6784. Vc_INTRINSIC __m128d min(__m128d a, __m128d b, double) { return _mm_min_pd(a, b); }
  6785. Vc_INTRINSIC __m128i min(__m128i a, __m128i b, int) { return SSE::min_epi32(a, b); }
  6786. Vc_INTRINSIC __m128i min(__m128i a, __m128i b, uint) { return SSE::min_epu32(a, b); }
  6787. Vc_INTRINSIC __m128i min(__m128i a, __m128i b, short) { return _mm_min_epi16(a, b); }
  6788. Vc_INTRINSIC __m128i min(__m128i a, __m128i b, ushort) { return SSE::min_epu16(a, b); }
  6789. Vc_INTRINSIC __m128i min(__m128i a, __m128i b, schar) { return SSE::min_epi8 (a, b); }
  6790. Vc_INTRINSIC __m128i min(__m128i a, __m128i b, uchar) { return _mm_min_epu8 (a, b); }
  6791. Vc_INTRINSIC __m128 max(__m128 a, __m128 b, float) { return _mm_max_ps(a, b); }
  6792. Vc_INTRINSIC __m128d max(__m128d a, __m128d b, double) { return _mm_max_pd(a, b); }
  6793. Vc_INTRINSIC __m128i max(__m128i a, __m128i b, int) { return SSE::max_epi32(a, b); }
  6794. Vc_INTRINSIC __m128i max(__m128i a, __m128i b, uint) { return SSE::max_epu32(a, b); }
  6795. Vc_INTRINSIC __m128i max(__m128i a, __m128i b, short) { return _mm_max_epi16(a, b); }
  6796. Vc_INTRINSIC __m128i max(__m128i a, __m128i b, ushort) { return SSE::max_epu16(a, b); }
  6797. Vc_INTRINSIC __m128i max(__m128i a, __m128i b, schar) { return SSE::max_epi8 (a, b); }
  6798. Vc_INTRINSIC __m128i max(__m128i a, __m128i b, uchar) { return _mm_max_epu8 (a, b); }
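// Horizontal reductions: collapse all elements of a register into one scalar
// by repeatedly combining the upper half of the register with the lower half.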
  6799. Vc_INTRINSIC float add(__m128 a, float) {
  6800. a = _mm_add_ps(a, _mm_movehl_ps(a, a));
  6801. a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
  6802. return _mm_cvtss_f32(a);
  6803. }
  6804. Vc_INTRINSIC double add(__m128d a, double) {
  6805. a = _mm_add_sd(a, _mm_unpackhi_pd(a, a));
  6806. return _mm_cvtsd_f64(a);
  6807. }
  6808. Vc_INTRINSIC int add(__m128i a, int) {
  6809. a = add(a, _mm_srli_si128(a, 8), int());
  6810. a = add(a, _mm_srli_si128(a, 4), int());
  6811. return _mm_cvtsi128_si32(a);
  6812. }
  6813. Vc_INTRINSIC uint add(__m128i a, uint) { return add(a, int()); }
  6814. Vc_INTRINSIC short add(__m128i a, short) {
  6815. a = add(a, _mm_srli_si128(a, 8), short());
  6816. a = add(a, _mm_srli_si128(a, 4), short());
  6817. a = add(a, _mm_srli_si128(a, 2), short());
  6818. return _mm_cvtsi128_si32(a);
  6819. }
  6820. Vc_INTRINSIC ushort add(__m128i a, ushort) { return add(a, short()); }
  6821. Vc_INTRINSIC schar add(__m128i a, schar) {
  6822. a = add(a, _mm_srli_si128(a, 8), schar());
  6823. a = add(a, _mm_srli_si128(a, 4), schar());
  6824. a = add(a, _mm_srli_si128(a, 2), schar());
  6825. a = add(a, _mm_srli_si128(a, 1), schar());
  6826. return _mm_cvtsi128_si32(a);
  6827. }
  6828. Vc_INTRINSIC uchar add(__m128i a, uchar) { return add(a, schar()); }
  6829. Vc_INTRINSIC float mul(__m128 a, float) {
  6830. a = _mm_mul_ps(a, _mm_movehl_ps(a, a));
  6831. a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
  6832. return _mm_cvtss_f32(a);
  6833. }
  6834. Vc_INTRINSIC double mul(__m128d a, double) {
  6835. a = _mm_mul_sd(a, _mm_unpackhi_pd(a, a));
  6836. return _mm_cvtsd_f64(a);
  6837. }
  6838. Vc_INTRINSIC int mul(__m128i a, int) {
  6839. a = mul(a, _mm_srli_si128(a, 8), int());
  6840. a = mul(a, _mm_srli_si128(a, 4), int());
  6841. return _mm_cvtsi128_si32(a);
  6842. }
  6843. Vc_INTRINSIC uint mul(__m128i a, uint) { return mul(a, int()); }
  6844. Vc_INTRINSIC short mul(__m128i a, short) {
  6845. a = mul(a, _mm_srli_si128(a, 8), short());
  6846. a = mul(a, _mm_srli_si128(a, 4), short());
  6847. a = mul(a, _mm_srli_si128(a, 2), short());
  6848. return _mm_cvtsi128_si32(a);
  6849. }
  6850. Vc_INTRINSIC ushort mul(__m128i a, ushort) { return mul(a, short()); }
  6851. Vc_INTRINSIC schar mul(__m128i a, schar) {
  6852. const __m128i s0 = _mm_srai_epi16(a, 1);
  6853. const __m128i s1 = Detail::and_(a, _mm_set1_epi32(0x0f0f0f0f));
  6854. return mul(mul(s0, s1, short()), short());
  6855. }
  6856. Vc_INTRINSIC uchar mul(__m128i a, uchar) { return mul(a, schar()); }
  6857. Vc_INTRINSIC float min(__m128 a, float) {
  6858. a = _mm_min_ps(a, _mm_movehl_ps(a, a));
  6859. a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
  6860. return _mm_cvtss_f32(a);
  6861. }
  6862. Vc_INTRINSIC double min(__m128d a, double) {
  6863. a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
  6864. return _mm_cvtsd_f64(a);
  6865. }
  6866. Vc_INTRINSIC int min(__m128i a, int) {
  6867. a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
  6868. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
  6869. return _mm_cvtsi128_si32(a);
  6870. }
  6871. Vc_INTRINSIC uint min(__m128i a, uint) {
  6872. a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
  6873. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
  6874. return _mm_cvtsi128_si32(a);
  6875. }
  6876. Vc_INTRINSIC short min(__m128i a, short) {
  6877. a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
  6878. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
  6879. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), short());
  6880. return _mm_cvtsi128_si32(a);
  6881. }
  6882. Vc_INTRINSIC ushort min(__m128i a, ushort) {
  6883. a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
  6884. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
  6885. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), ushort());
  6886. return _mm_cvtsi128_si32(a);
  6887. }
  6888. Vc_INTRINSIC schar min(__m128i a, schar) {
  6889. a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
  6890. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
  6891. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
  6892. return std::min(schar(_mm_cvtsi128_si32(a) >> 8), schar(_mm_cvtsi128_si32(a)));
  6893. }
  6894. Vc_INTRINSIC uchar min(__m128i a, uchar) {
  6895. a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
  6896. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
  6897. a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
  6898. return std::min((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff);
  6899. }
  6900. Vc_INTRINSIC float max(__m128 a, float) {
  6901. a = _mm_max_ps(a, _mm_movehl_ps(a, a));
  6902. a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
  6903. return _mm_cvtss_f32(a);
  6904. }
  6905. Vc_INTRINSIC double max(__m128d a, double) {
  6906. a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
  6907. return _mm_cvtsd_f64(a);
  6908. }
  6909. Vc_INTRINSIC int max(__m128i a, int) {
  6910. a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
  6911. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
  6912. return _mm_cvtsi128_si32(a);
  6913. }
  6914. Vc_INTRINSIC uint max(__m128i a, uint) {
  6915. a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
  6916. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
  6917. return _mm_cvtsi128_si32(a);
  6918. }
  6919. Vc_INTRINSIC short max(__m128i a, short) {
  6920. a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
  6921. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
  6922. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), short());
  6923. return _mm_cvtsi128_si32(a);
  6924. }
  6925. Vc_INTRINSIC ushort max(__m128i a, ushort) {
  6926. a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
  6927. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
  6928. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), ushort());
  6929. return _mm_cvtsi128_si32(a);
  6930. }
  6931. Vc_INTRINSIC schar max(__m128i a, schar) {
  6932. a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
  6933. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
  6934. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
  6935. return std::max(schar(_mm_cvtsi128_si32(a) >> 8), schar(_mm_cvtsi128_si32(a)));
  6936. }
  6937. Vc_INTRINSIC uchar max(__m128i a, uchar) {
  6938. a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
  6939. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
  6940. a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
  6941. return std::max((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff);
  6942. }
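// sorted: dispatches to the sorting routine compiled for the statically
// selected implementation (SSE2/SSSE3, SSE4.1/SSE4.2, or the current one);
// instantiation with the Scalar implementation is rejected at compile time.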
  6943. template <Vc::Implementation, typename T>
  6944. Vc_CONST_L SSE::Vector<T> sorted(SSE::Vector<T> x) Vc_CONST_R;
  6945. template <typename T> Vc_INTRINSIC Vc_CONST SSE::Vector<T> sorted(SSE::Vector<T> x)
  6946. {
  6947. static_assert(!CurrentImplementation::is(ScalarImpl),
  6948. "Detail::sorted can only be instantiated if a non-Scalar "
  6949. "implementation is selected.");
  6950. return sorted < CurrentImplementation::is_between(SSE2Impl, SSSE3Impl)
  6951. ? SSE2Impl
  6952. : CurrentImplementation::is_between(SSE41Impl, SSE42Impl)
  6953. ? SSE41Impl
  6954. : CurrentImplementation::current() > (x);
  6955. }
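// rotated: rotates the elements of a 16-byte register by `amount` positions
// via _mm_alignr_epi8; sanitize clamps the byte offset so an out-of-range
// shift yields 0 instead of undefined behavior.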
  6956. template <typename V> constexpr int sanitize(int n)
  6957. {
  6958. return (n >= int(sizeof(V)) || n <= -int(sizeof(V))) ? 0 : n;
  6959. }
  6960. template <typename T, size_t N, typename V>
  6961. static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> rotated(V v, int amount)
  6962. {
  6963. using namespace SSE;
  6964. switch (static_cast<unsigned int>(amount) % N) {
  6965. case 0:
  6966. return v;
  6967. case 1:
  6968. return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(1 * sizeof(T))));
  6969. case 2:
  6970. return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(2 * sizeof(T))));
  6971. case 3:
  6972. return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(3 * sizeof(T))));
  6973. case 4:
  6974. return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(4 * sizeof(T))));
  6975. case 5:
  6976. return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(5 * sizeof(T))));
  6977. case 6:
  6978. return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(6 * sizeof(T))));
  6979. case 7:
  6980. return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(7 * sizeof(T))));
  6981. }
  6982. return sse_cast<V>(_mm_setzero_si128());
  6983. }
  6984. template<typename V, size_t Size, size_t VSize> struct InterleaveImpl;
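// InterleaveImpl<V, 8, 16>: (de)interleaving for vectors with eight 2-byte
// elements (short/ushort). The overloads for 2 to 8 interleaved members are
// built from _mm_unpacklo/hi_epi16 shuffles, with scatter/gather covering
// the remainder.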
  6985. template<typename V> struct InterleaveImpl<V, 8, 16> {
  6986. template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
  6987. const typename V::AsArg v0, const typename V::AsArg v1)
  6988. {
  6989. const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
  6990. const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
  6991. #ifdef __x86_64__
  6992. const long long tmp00 = _mm_cvtsi128_si64(tmp0);
  6993. const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0));
  6994. const long long tmp10 = _mm_cvtsi128_si64(tmp1);
  6995. const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1));
  6996. aliasing_cast<int>(data[i[0]]) = tmp00;
  6997. aliasing_cast<int>(data[i[1]]) = tmp00 >> 32;
  6998. aliasing_cast<int>(data[i[2]]) = tmp01;
  6999. aliasing_cast<int>(data[i[3]]) = tmp01 >> 32;
  7000. aliasing_cast<int>(data[i[4]]) = tmp10;
  7001. aliasing_cast<int>(data[i[5]]) = tmp10 >> 32;
  7002. aliasing_cast<int>(data[i[6]]) = tmp11;
  7003. aliasing_cast<int>(data[i[7]]) = tmp11 >> 32;
  7004. #elif defined(Vc_IMPL_SSE4_1)
  7005. using namespace SseIntrinsics;
  7006. aliasing_cast<int>(data[i[0]]) = _mm_cvtsi128_si32(tmp0);
  7007. aliasing_cast<int>(data[i[1]]) = extract_epi32<1>(tmp0);
  7008. aliasing_cast<int>(data[i[2]]) = extract_epi32<2>(tmp0);
  7009. aliasing_cast<int>(data[i[3]]) = extract_epi32<3>(tmp0);
  7010. aliasing_cast<int>(data[i[4]]) = _mm_cvtsi128_si32(tmp1);
  7011. aliasing_cast<int>(data[i[5]]) = extract_epi32<1>(tmp1);
  7012. aliasing_cast<int>(data[i[6]]) = extract_epi32<2>(tmp1);
  7013. aliasing_cast<int>(data[i[7]]) = extract_epi32<3>(tmp1);
  7014. #else
  7015. aliasing_cast<int>(data[i[0]]) = _mm_cvtsi128_si32(tmp0);
  7016. aliasing_cast<int>(data[i[1]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 4));
  7017. aliasing_cast<int>(data[i[2]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 8));
  7018. aliasing_cast<int>(data[i[3]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 12));
  7019. aliasing_cast<int>(data[i[4]]) = _mm_cvtsi128_si32(tmp1);
  7020. aliasing_cast<int>(data[i[5]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 4));
  7021. aliasing_cast<int>(data[i[6]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 8));
  7022. aliasing_cast<int>(data[i[7]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 12));
  7023. #endif
  7024. }
  7025. static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
  7026. const typename V::AsArg v0, const typename V::AsArg v1)
  7027. {
  7028. const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
  7029. const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
  7030. V(tmp0).store(&data[i[0]], Vc::Unaligned);
  7031. V(tmp1).store(&data[i[4]], Vc::Unaligned);
  7032. }
  7033. template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
  7034. const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
  7035. {
  7036. #if defined Vc_USE_MASKMOV_SCATTER && !defined Vc_MSVC
  7037. const __m64 mask = _mm_set_pi16(0, -1, -1, -1);
  7038. const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
  7039. const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
  7040. const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data());
  7041. const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data());
  7042. const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
  7043. const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
  7044. const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
  7045. const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
  7046. _mm_maskmove_si64(_mm_movepi64_pi64(tmp4), mask, reinterpret_cast<char *>(&data[i[0]]));
  7047. _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp4, 8)), mask, reinterpret_cast<char *>(&data[i[1]]));
  7048. _mm_maskmove_si64(_mm_movepi64_pi64(tmp5), mask, reinterpret_cast<char *>(&data[i[2]]));
  7049. _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp5, 8)), mask, reinterpret_cast<char *>(&data[i[3]]));
  7050. _mm_maskmove_si64(_mm_movepi64_pi64(tmp6), mask, reinterpret_cast<char *>(&data[i[4]]));
  7051. _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp6, 8)), mask, reinterpret_cast<char *>(&data[i[5]]));
  7052. _mm_maskmove_si64(_mm_movepi64_pi64(tmp7), mask, reinterpret_cast<char *>(&data[i[6]]));
  7053. _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp7, 8)), mask, reinterpret_cast<char *>(&data[i[7]]));
  7054. _mm_empty();
  7055. #else
  7056. interleave(data, i, v0, v1);
  7057. v2.scatter(data + 2, i);
  7058. #endif
  7059. }
  7060. template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
  7061. const typename V::AsArg v0, const typename V::AsArg v1,
  7062. const typename V::AsArg v2, const typename V::AsArg v3)
  7063. {
  7064. const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
  7065. const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
  7066. const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
  7067. const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());
  7068. const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
  7069. const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
  7070. const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
  7071. const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
  7072. _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4);
  7073. _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5);
  7074. _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6);
  7075. _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7);
  7076. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4));
  7077. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5));
  7078. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6));
  7079. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7));
  7080. }
  7081. static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,
  7082. const typename V::AsArg v0, const typename V::AsArg v1,
  7083. const typename V::AsArg v2, const typename V::AsArg v3)
  7084. {
  7085. const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
  7086. const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
  7087. const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
  7088. const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());
  7089. const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
  7090. const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
  7091. const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
  7092. const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
  7093. V(tmp4).store(&data[i[0]], ::Vc::Unaligned);
  7094. V(tmp5).store(&data[i[2]], ::Vc::Unaligned);
  7095. V(tmp6).store(&data[i[4]], ::Vc::Unaligned);
  7096. V(tmp7).store(&data[i[6]], ::Vc::Unaligned);
  7097. }
  7098. template <typename I>
  7099. static inline void interleave(typename V::EntryType *const data, const I &i,
  7100. const typename V::AsArg v0, const typename V::AsArg v1,
  7101. const typename V::AsArg v2, const typename V::AsArg v3,
  7102. const typename V::AsArg v4)
  7103. {
  7104. interleave(data, i, v0, v1, v2, v3);
  7105. v4.scatter(data + 4, i);
  7106. }
  7107. template <typename I>
  7108. static inline void interleave(typename V::EntryType *const data, const I &i,
  7109. const typename V::AsArg v0, const typename V::AsArg v1,
  7110. const typename V::AsArg v2, const typename V::AsArg v3,
  7111. const typename V::AsArg v4, const typename V::AsArg v5)
  7112. {
  7113. interleave(data, i, v0, v1, v2, v3);
  7114. interleave(data + 4, i, v4, v5);
  7115. }
  7116. template <typename I>
  7117. static inline void interleave(typename V::EntryType *const data, const I &i,
  7118. const typename V::AsArg v0, const typename V::AsArg v1,
  7119. const typename V::AsArg v2, const typename V::AsArg v3,
  7120. const typename V::AsArg v4, const typename V::AsArg v5,
  7121. const typename V::AsArg v6)
  7122. {
  7123. interleave(data, i, v0, v1, v2, v3);
  7124. interleave(data + 4, i, v4, v5, v6);
  7125. }
  7126. template <typename I>
  7127. static inline void interleave(typename V::EntryType *const data, const I &i,
  7128. const typename V::AsArg v0, const typename V::AsArg v1,
  7129. const typename V::AsArg v2, const typename V::AsArg v3,
  7130. const typename V::AsArg v4, const typename V::AsArg v5,
  7131. const typename V::AsArg v6, const typename V::AsArg v7)
  7132. {
  7133. interleave(data, i, v0, v1, v2, v3);
  7134. interleave(data + 4, i, v4, v5, v6, v7);
  7135. }
  7136. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7137. const I &i, V &v0, V &v1)
  7138. {
  7139. const __m128i a = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[0]]));
  7140. const __m128i b = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[1]]));
  7141. const __m128i c = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[2]]));
  7142. const __m128i d = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[3]]));
  7143. const __m128i e = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[4]]));
  7144. const __m128i f = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[5]]));
  7145. const __m128i g = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[6]]));
  7146. const __m128i h = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[7]]));
  7147. const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
  7148. const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
  7149. const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
  7150. const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
  7151. const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
  7152. const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
  7153. v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
  7154. v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
  7155. }
  7156. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7157. const I &i, V &v0, V &v1, V &v2)
  7158. {
  7159. const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[0]]));
  7160. const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[1]]));
  7161. const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[2]]));
  7162. const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[3]]));
  7163. const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[4]]));
  7164. const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[5]]));
  7165. const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[6]]));
  7166. const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[7]]));
  7167. const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
  7168. const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
  7169. const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
  7170. const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
  7171. const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
  7172. const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
  7173. const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
  7174. const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
  7175. v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
  7176. v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
  7177. v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
  7178. }
  7179. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7180. const I &i, V &v0, V &v1, V &v2, V &v3)
  7181. {
  7182. const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[0]]));
  7183. const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[1]]));
  7184. const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[2]]));
  7185. const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[3]]));
  7186. const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[4]]));
  7187. const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[5]]));
  7188. const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[6]]));
  7189. const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[7]]));
  7190. const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
  7191. const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
  7192. const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
  7193. const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
  7194. const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
  7195. const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
  7196. const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
  7197. const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
  7198. v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
  7199. v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
  7200. v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
  7201. v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
  7202. }
  7203. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7204. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
  7205. {
  7206. const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
  7207. const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
  7208. const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
  7209. const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
  7210. const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
  7211. const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
  7212. const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
  7213. const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
  7214. const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
  7215. const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
  7216. const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
  7217. const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
  7218. const __m128i tmp10 = _mm_unpackhi_epi16(a, e);
  7219. const __m128i tmp11 = _mm_unpackhi_epi16(c, g);
  7220. const __m128i tmp12 = _mm_unpackhi_epi16(b, f);
  7221. const __m128i tmp13 = _mm_unpackhi_epi16(d, h);
  7222. const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
  7223. const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
  7224. const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
  7225. const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
  7226. const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11);
  7227. const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13);
  7228. v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
  7229. v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
  7230. v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
  7231. v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
  7232. v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
  7233. }
  7234. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7235. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
  7236. {
  7237. const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
  7238. const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
  7239. const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
  7240. const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
  7241. const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
  7242. const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
  7243. const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
  7244. const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
  7245. const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
  7246. const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
  7247. const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
  7248. const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
  7249. const __m128i tmp10 = _mm_unpackhi_epi16(a, e);
  7250. const __m128i tmp11 = _mm_unpackhi_epi16(c, g);
  7251. const __m128i tmp12 = _mm_unpackhi_epi16(b, f);
  7252. const __m128i tmp13 = _mm_unpackhi_epi16(d, h);
  7253. const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
  7254. const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
  7255. const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
  7256. const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
  7257. const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11);
  7258. const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13);
  7259. v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
  7260. v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
  7261. v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
  7262. v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
  7263. v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
  7264. v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
  7265. }
  7266. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7267. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
  7268. {
  7269. const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
  7270. const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
  7271. const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
  7272. const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
  7273. const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
  7274. const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
  7275. const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
  7276. const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
  7277. const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
  7278. const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
  7279. const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
  7280. const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
  7281. const __m128i tmp10 = _mm_unpackhi_epi16(a, e);
  7282. const __m128i tmp11 = _mm_unpackhi_epi16(c, g);
  7283. const __m128i tmp12 = _mm_unpackhi_epi16(b, f);
  7284. const __m128i tmp13 = _mm_unpackhi_epi16(d, h);
  7285. const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
  7286. const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
  7287. const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
  7288. const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
  7289. const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11);
  7290. const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13);
  7291. const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11);
  7292. const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13);
  7293. v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
  7294. v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
  7295. v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
  7296. v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
  7297. v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
  7298. v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
  7299. v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
  7300. }
  7301. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7302. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
  7303. {
  7304. const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
  7305. const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
  7306. const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
  7307. const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
  7308. const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
  7309. const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
  7310. const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
  7311. const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
  7312. const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
  7313. const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
  7314. const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
  7315. const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
  7316. const __m128i tmp10 = _mm_unpackhi_epi16(a, e);
  7317. const __m128i tmp11 = _mm_unpackhi_epi16(c, g);
  7318. const __m128i tmp12 = _mm_unpackhi_epi16(b, f);
  7319. const __m128i tmp13 = _mm_unpackhi_epi16(d, h);
  7320. const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
  7321. const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
  7322. const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
  7323. const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
  7324. const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11);
  7325. const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13);
  7326. const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11);
  7327. const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13);
  7328. v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
  7329. v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
  7330. v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
  7331. v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
  7332. v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
  7333. v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
  7334. v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
  7335. v7.data() = _mm_unpackhi_epi16(tmp14, tmp15);
  7336. }
  7337. };
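// InterleaveImpl<V, 4, 16>: (de)interleaving for vectors with four 4-byte
// elements (float/int/uint), implemented with _mm_unpacklo/hi_ps and
// movelh/movehl shuffles.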
  7338. template<typename V> struct InterleaveImpl<V, 4, 16> {
  7339. static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
  7340. const typename V::AsArg v0, const typename V::AsArg v1)
  7341. {
  7342. const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
  7343. const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
  7344. _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), tmp0);
  7345. _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), tmp1);
  7346. }
  7347. template <typename I>
  7348. static inline void interleave(typename V::EntryType *const data, const I &i,
  7349. const typename V::AsArg v0, const typename V::AsArg v1)
  7350. {
  7351. const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
  7352. const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
  7353. _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
  7354. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
  7355. _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
  7356. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
  7357. }
  7358. template <typename I>
  7359. static inline void interleave(typename V::EntryType *const data, const I &i,
  7360. const typename V::AsArg v0, const typename V::AsArg v1,
  7361. const typename V::AsArg v2)
  7362. {
  7363. #ifdef Vc_USE_MASKMOV_SCATTER
  7364. const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
  7365. const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
  7366. const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
  7367. const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
  7368. const __m128i mask = _mm_set_epi32(0, -1, -1, -1);
  7369. _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast<char *>(&data[i[0]]));
  7370. _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast<char *>(&data[i[1]]));
  7371. _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast<char *>(&data[i[2]]));
  7372. _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast<char *>(&data[i[3]]));
  7373. #else
  7374. const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
  7375. const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
  7376. _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
  7377. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
  7378. _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
  7379. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
  7380. v2.scatter(data + 2, i);
  7381. #endif
  7382. }
  7383. template <typename I>
  7384. static inline void interleave(typename V::EntryType *const data, const I &i,
  7385. const typename V::AsArg v0, const typename V::AsArg v1,
  7386. const typename V::AsArg v2, const typename V::AsArg v3)
  7387. {
  7388. const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
  7389. const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
  7390. const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
  7391. const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
  7392. _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), _mm_movelh_ps(tmp0, tmp2));
  7393. _mm_storeu_ps(aliasing_cast<float>(&data[i[1]]), _mm_movehl_ps(tmp2, tmp0));
  7394. _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), _mm_movelh_ps(tmp1, tmp3));
  7395. _mm_storeu_ps(aliasing_cast<float>(&data[i[3]]), _mm_movehl_ps(tmp3, tmp1));
  7396. }
  7397. template <typename I>
  7398. static inline void interleave(typename V::EntryType *const data, const I &i,
  7399. const typename V::AsArg v0, const typename V::AsArg v1,
  7400. const typename V::AsArg v2, const typename V::AsArg v3,
  7401. const typename V::AsArg v4)
  7402. {
  7403. interleave(data, i, v0, v1, v2, v3);
  7404. v4.scatter(data + 4, i);
  7405. }
  7406. template <typename I>
  7407. static inline void interleave(typename V::EntryType *const data, const I &i,
  7408. const typename V::AsArg v0, const typename V::AsArg v1,
  7409. const typename V::AsArg v2, const typename V::AsArg v3,
  7410. const typename V::AsArg v4, const typename V::AsArg v5)
  7411. {
  7412. interleave(data, i, v0, v1, v2, v3);
  7413. interleave(data + 4, i, v4, v5);
  7414. }
  7415. template <typename I>
  7416. static inline void interleave(typename V::EntryType *const data, const I &i,
  7417. const typename V::AsArg v0, const typename V::AsArg v1,
  7418. const typename V::AsArg v2, const typename V::AsArg v3,
  7419. const typename V::AsArg v4, const typename V::AsArg v5,
  7420. const typename V::AsArg v6)
  7421. {
  7422. interleave(data, i, v0, v1, v2, v3);
  7423. interleave(data + 4, i, v4, v5, v6);
  7424. }
  7425. template <typename I>
  7426. static inline void interleave(typename V::EntryType *const data, const I &i,
  7427. const typename V::AsArg v0, const typename V::AsArg v1,
  7428. const typename V::AsArg v2, const typename V::AsArg v3,
  7429. const typename V::AsArg v4, const typename V::AsArg v5,
  7430. const typename V::AsArg v6, const typename V::AsArg v7)
  7431. {
  7432. interleave(data, i, v0, v1, v2, v3);
  7433. interleave(data + 4, i, v4, v5, v6, v7);
  7434. }
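// Editor's note: the 5- to 8-stream overloads above introduce no new shuffle
// code; they interleave the first four streams as one block and handle the
// remaining members recursively at data + 4, so each record is written as a
// 4-element prefix followed by its trailing members.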
  7435. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7436. const I &i, V &v0, V &v1)
  7437. {
  7438. const __m128 a = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[0]])));
  7439. const __m128 b = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[1]])));
  7440. const __m128 c = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[2]])));
  7441. const __m128 d = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[3]])));
  7442. const __m128 tmp0 = _mm_unpacklo_ps(a, b);
  7443. const __m128 tmp1 = _mm_unpacklo_ps(c, d);
  7444. v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
  7445. v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
  7446. }
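// Editor's note: each _mm_load_sd above fetches one (v0, v1) pair as a single
// 64-bit load; the unpacklo and movelh/movehl steps then collect the four
// first members into v0 and the four second members into v1, i.e. the inverse
// of the 2-stream interleave.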
  7447. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7448. const I &i, V &v0, V &v1, V &v2)
  7449. {
  7450. const __m128 a = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
  7451. const __m128 b = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
  7452. const __m128 c = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
  7453. const __m128 d = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
  7454. const __m128 tmp0 = _mm_unpacklo_ps(a, b);
  7455. const __m128 tmp1 = _mm_unpacklo_ps(c, d);
  7456. const __m128 tmp2 = _mm_unpackhi_ps(a, b);
  7457. const __m128 tmp3 = _mm_unpackhi_ps(c, d);
  7458. v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
  7459. v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
  7460. v2.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp2, tmp3));
  7461. }
  7462. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7463. const I &i, V &v0, V &v1, V &v2, V &v3)
  7464. {
  7465. const __m128 a = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
  7466. const __m128 b = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
  7467. const __m128 c = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
  7468. const __m128 d = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
  7469. const __m128 tmp0 = _mm_unpacklo_ps(a, b);
  7470. const __m128 tmp1 = _mm_unpacklo_ps(c, d);
  7471. const __m128 tmp2 = _mm_unpackhi_ps(a, b);
  7472. const __m128 tmp3 = _mm_unpackhi_ps(c, d);
  7473. v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
  7474. v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
  7475. v2.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp2, tmp3));
  7476. v3.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp3, tmp2));
  7477. }
  7478. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7479. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
  7480. {
  7481. deinterleave(data, i, v0, v1, v2, v3);
  7482. v4.gather(data + 4, i);
  7483. }
  7484. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7485. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
  7486. {
  7487. deinterleave(data, i, v0, v1, v2, v3);
  7488. deinterleave(data + 4, i, v4, v5);
  7489. }
  7490. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7491. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
  7492. {
  7493. deinterleave(data, i, v0, v1, v2, v3);
  7494. deinterleave(data + 4, i, v4, v5, v6);
  7495. }
  7496. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7497. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
  7498. {
  7499. deinterleave(data, i, v0, v1, v2, v3);
  7500. deinterleave(data + 4, i, v4, v5, v6, v7);
  7501. }
  7502. };
  7503. template<typename V> struct InterleaveImpl<V, 2, 16> {
  7504. template <typename I>
  7505. static inline void interleave(typename V::EntryType *const data, const I &i,
  7506. const typename V::AsArg v0, const typename V::AsArg v1)
  7507. {
  7508. const __m128d tmp0 = _mm_unpacklo_pd(v0.data(), v1.data());
  7509. const __m128d tmp1 = _mm_unpackhi_pd(v0.data(), v1.data());
  7510. _mm_storeu_pd(&data[i[0]], tmp0);
  7511. _mm_storeu_pd(&data[i[1]], tmp1);
  7512. }
  7513. template <typename I>
  7514. static inline void interleave(typename V::EntryType *const data, const I &i,
  7515. const typename V::AsArg v0, const typename V::AsArg v1,
  7516. const typename V::AsArg v2)
  7517. {
  7518. interleave(data, i, v0, v1);
  7519. v2.scatter(data + 2, i);
  7520. }
  7521. template <typename I>
  7522. static inline void interleave(typename V::EntryType *const data, const I &i,
  7523. const typename V::AsArg v0, const typename V::AsArg v1,
  7524. const typename V::AsArg v2, const typename V::AsArg v3)
  7525. {
  7526. interleave(data, i, v0, v1);
  7527. interleave(data + 2, i, v2, v3);
  7528. }
  7529. template <typename I>
  7530. static inline void interleave(typename V::EntryType *const data, const I &i,
  7531. const typename V::AsArg v0, const typename V::AsArg v1,
  7532. const typename V::AsArg v2, const typename V::AsArg v3,
  7533. const typename V::AsArg v4)
  7534. {
  7535. interleave(data, i, v0, v1, v2, v3);
  7536. v4.scatter(data + 4, i);
  7537. }
  7538. template <typename I>
  7539. static inline void interleave(typename V::EntryType *const data, const I &i,
  7540. const typename V::AsArg v0, const typename V::AsArg v1,
  7541. const typename V::AsArg v2, const typename V::AsArg v3,
  7542. const typename V::AsArg v4, const typename V::AsArg v5)
  7543. {
  7544. interleave(data, i, v0, v1, v2, v3);
  7545. interleave(data + 4, i, v4, v5);
  7546. }
  7547. template <typename I>
  7548. static inline void interleave(typename V::EntryType *const data, const I &i,
  7549. const typename V::AsArg v0, const typename V::AsArg v1,
  7550. const typename V::AsArg v2, const typename V::AsArg v3,
  7551. const typename V::AsArg v4, const typename V::AsArg v5,
  7552. const typename V::AsArg v6)
  7553. {
  7554. interleave(data, i, v0, v1, v2, v3);
  7555. interleave(data + 4, i, v4, v5, v6);
  7556. }
  7557. template <typename I>
  7558. static inline void interleave(typename V::EntryType *const data, const I &i,
  7559. const typename V::AsArg v0, const typename V::AsArg v1,
  7560. const typename V::AsArg v2, const typename V::AsArg v3,
  7561. const typename V::AsArg v4, const typename V::AsArg v5,
  7562. const typename V::AsArg v6, const typename V::AsArg v7)
  7563. {
  7564. interleave(data, i, v0, v1, v2, v3);
  7565. interleave(data + 4, i, v4, v5, v6, v7);
  7566. }
  7567. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7568. const I &i, V &v0, V &v1)
  7569. {
  7570. const __m128d a = _mm_loadu_pd(&data[i[0]]);
  7571. const __m128d b = _mm_loadu_pd(&data[i[1]]);
  7572. v0.data() = _mm_unpacklo_pd(a, b);
  7573. v1.data() = _mm_unpackhi_pd(a, b);
  7574. }
  7575. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7576. const I &i, V &v0, V &v1, V &v2)
  7577. {
  7578. v2.gather(data + 2, i);
  7579. deinterleave(data, i, v0, v1);
  7580. }
  7581. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7582. const I &i, V &v0, V &v1, V &v2, V &v3)
  7583. {
  7584. deinterleave(data, i, v0, v1);
  7585. deinterleave(data + 2, i, v2, v3);
  7586. }
  7587. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7588. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
  7589. {
  7590. deinterleave(data, i, v0, v1);
  7591. deinterleave(data + 2, i, v2, v3);
  7592. v4.gather(data + 4, i);
  7593. }
  7594. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7595. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
  7596. {
  7597. deinterleave(data, i, v0, v1);
  7598. deinterleave(data + 2, i, v2, v3);
  7599. deinterleave(data + 4, i, v4, v5);
  7600. }
  7601. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7602. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
  7603. {
  7604. deinterleave(data, i, v0, v1);
  7605. deinterleave(data + 2, i, v2, v3);
  7606. deinterleave(data + 4, i, v4, v5);
  7607. v6.gather(data + 6, i);
  7608. }
  7609. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  7610. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
  7611. {
  7612. deinterleave(data, i, v0, v1);
  7613. deinterleave(data + 2, i, v2, v3);
  7614. deinterleave(data + 4, i, v4, v5);
  7615. deinterleave(data + 6, i, v6, v7);
  7616. }
  7617. };
  7618. }
  7619. }
  7620. #endif
  7621. namespace Vc_VERSIONED_NAMESPACE
  7622. {
  7623. namespace Detail
  7624. {
  7625. template <size_t Size>
  7626. Vc_INTRINSIC_L Vc_CONST_L int mask_count(__m128i) Vc_INTRINSIC_R Vc_CONST_R;
  7627. template <size_t Size>
  7628. Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m128i) Vc_INTRINSIC_R Vc_CONST_R;
  7629. template <size_t Size>
  7630. Vc_INTRINSIC_L Vc_CONST_L bool is_equal(__m128, __m128) Vc_INTRINSIC_R Vc_CONST_R;
  7631. template <size_t Size>
  7632. Vc_INTRINSIC_L Vc_CONST_L bool is_not_equal(__m128, __m128) Vc_INTRINSIC_R Vc_CONST_R;
  7633. }
  7634. using SSE::sse_cast;
  7635. template <typename T> class Mask<T, VectorAbi::Sse>
  7636. {
  7637. using abi = VectorAbi::Sse;
  7638. friend class Mask< double, abi>;
  7639. friend class Mask< float, abi>;
  7640. friend class Mask< int32_t, abi>;
  7641. friend class Mask<uint32_t, abi>;
  7642. friend class Mask< int16_t, abi>;
  7643. friend class Mask<uint16_t, abi>;
  7644. typedef Common::MaskBool<sizeof(T)> MaskBool;
  7645. typedef Common::Storage<T, SSE::VectorTraits<T>::Size> Storage;
  7646. public:
  7647. typedef bool EntryType;
  7648. using value_type = EntryType;
  7649. using EntryReference = Detail::ElementReference<Mask>;
  7650. using reference = EntryReference;
  7651. typedef MaskBool VectorEntryType;
  7652. using VectorType = typename Storage::VectorType;
  7653. using Vector = SSE::Vector<T>;
  7654. public:
  7655. Vc_FREE_STORE_OPERATORS_ALIGNED(16);
  7656. static constexpr size_t Size = SSE::VectorTraits<T>::Size;
  7657. static constexpr size_t MemoryAlignment = Size;
  7658. static constexpr std::size_t size() { return Size; }
  7659. #if defined Vc_MSVC && defined _WIN32
  7660. typedef const Mask &Argument;
  7661. #else
  7662. typedef Mask Argument;
  7663. #endif
  7664. Vc_INTRINSIC Mask() = default;
  7665. Vc_INTRINSIC Mask(const Mask &) = default;
  7666. Vc_INTRINSIC Mask &operator=(const Mask &) = default;
  7667. Vc_INTRINSIC Mask(const __m128 &x) : d(sse_cast<VectorType>(x)) {}
  7668. Vc_INTRINSIC Mask(const __m128d &x) : d(sse_cast<VectorType>(x)) {}
  7669. Vc_INTRINSIC Mask(const __m128i &x) : d(sse_cast<VectorType>(x)) {}
  7670. Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : Mask(_mm_setzero_ps()) {}
  7671. Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : Mask(SSE::_mm_setallone_ps()) {}
  7672. Vc_INTRINSIC explicit Mask(bool b) : Mask(b ? SSE::_mm_setallone_ps() : _mm_setzero_ps()) {}
  7673. Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; }
  7674. Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; }
  7675. template <typename U>
  7676. Vc_INTRINSIC Mask(
  7677. U &&rhs, Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg)
  7678. : d(sse_cast<VectorType>(
  7679. Detail::mask_cast<Traits::simd_vector_size<U>::value, Size, __m128>(
  7680. rhs.dataI())))
  7681. {
  7682. }
  7683. #if Vc_IS_VERSION_1
  7684. template <typename U>
  7685. Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
  7686. "mask types") Vc_INTRINSIC
  7687. explicit Mask(U &&rhs,
  7688. Common::enable_if_mask_converts_explicitly<T, U> = nullarg);
  7689. #endif
  7690. Vc_ALWAYS_INLINE explicit Mask(const bool *mem) { load(mem); }
  7691. template<typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags f) { load(mem, f); }
  7692. Vc_ALWAYS_INLINE_L void load(const bool *mem) Vc_ALWAYS_INLINE_R;
  7693. template<typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags) { load(mem); }
  7694. Vc_ALWAYS_INLINE_L void store(bool *) const Vc_ALWAYS_INLINE_R;
  7695. template<typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags) const { store(mem); }
  7696. Vc_ALWAYS_INLINE Vc_PURE bool operator==(const Mask &rhs) const
  7697. {
  7698. return Detail::is_equal<Size>(dataF(), rhs.dataF());
  7699. }
  7700. Vc_ALWAYS_INLINE Vc_PURE bool operator!=(const Mask &rhs) const
  7701. {
  7702. return Detail::is_not_equal<Size>(dataF(), rhs.dataF());
  7703. }
  7704. Vc_ALWAYS_INLINE Vc_PURE Mask operator!() const
  7705. {
  7706. #ifdef Vc_GCC
  7707. return ~dataI();
  7708. #else
  7709. return _mm_andnot_si128(dataI(), SSE::_mm_setallone_si128());
  7710. #endif
  7711. }
  7712. Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { d.v() = SSE::sse_cast<VectorType>(_mm_and_ps(dataF(), rhs.dataF())); return *this; }
  7713. Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { d.v() = SSE::sse_cast<VectorType>(_mm_or_ps (dataF(), rhs.dataF())); return *this; }
  7714. Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { d.v() = SSE::sse_cast<VectorType>(_mm_xor_ps(dataF(), rhs.dataF())); return *this; }
  7715. Vc_ALWAYS_INLINE Vc_PURE Mask operator&(const Mask &rhs) const { return _mm_and_ps(dataF(), rhs.dataF()); }
  7716. Vc_ALWAYS_INLINE Vc_PURE Mask operator|(const Mask &rhs) const { return _mm_or_ps (dataF(), rhs.dataF()); }
  7717. Vc_ALWAYS_INLINE Vc_PURE Mask operator^(const Mask &rhs) const { return _mm_xor_ps(dataF(), rhs.dataF()); }
  7718. Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &rhs) const { return _mm_and_ps(dataF(), rhs.dataF()); }
  7719. Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &rhs) const { return _mm_or_ps (dataF(), rhs.dataF()); }
  7720. Vc_ALWAYS_INLINE Vc_PURE bool isFull () const { return
  7721. #ifdef Vc_USE_PTEST
  7722. _mm_testc_si128(dataI(), SSE::_mm_setallone_si128());
  7723. #else
  7724. _mm_movemask_epi8(dataI()) == 0xffff;
  7725. #endif
  7726. }
  7727. Vc_ALWAYS_INLINE Vc_PURE bool isNotEmpty() const { return
  7728. #ifdef Vc_USE_PTEST
  7729. 0 == _mm_testz_si128(dataI(), dataI());
  7730. #else
  7731. _mm_movemask_epi8(dataI()) != 0x0000;
  7732. #endif
  7733. }
  7734. Vc_ALWAYS_INLINE Vc_PURE bool isEmpty() const { return
  7735. #ifdef Vc_USE_PTEST
  7736. 0 != _mm_testz_si128(dataI(), dataI());
  7737. #else
  7738. _mm_movemask_epi8(dataI()) == 0x0000;
  7739. #endif
  7740. }
  7741. Vc_ALWAYS_INLINE Vc_PURE bool isMix() const {
  7742. #ifdef Vc_USE_PTEST
  7743. return _mm_test_mix_ones_zeros(dataI(), SSE::_mm_setallone_si128());
  7744. #else
  7745. const int tmp = _mm_movemask_epi8(dataI());
  7746. return tmp != 0 && (tmp ^ 0xffff) != 0;
  7747. #endif
  7748. }
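// Editor's note: the queries above rely on _mm_movemask_epi8 collecting the
// most significant bit of each of the 16 bytes. Because every mask lane is
// canonically all-ones or all-zeros, a full mask yields 0xffff, an empty mask
// yields 0, and anything in between is a mix; the PTEST variants test the same
// conditions directly from the vector registers.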
  7749. Vc_ALWAYS_INLINE Vc_PURE int shiftMask() const { return _mm_movemask_epi8(dataI()); }
  7750. Vc_ALWAYS_INLINE Vc_PURE int toInt() const { return Detail::mask_to_int<Size>(dataI()); }
  7751. Vc_ALWAYS_INLINE Vc_PURE VectorType data() const { return d.v(); }
  7752. Vc_ALWAYS_INLINE Vc_PURE __m128 dataF() const { return SSE::sse_cast<__m128 >(d.v()); }
  7753. Vc_ALWAYS_INLINE Vc_PURE __m128i dataI() const { return SSE::sse_cast<__m128i>(d.v()); }
  7754. Vc_ALWAYS_INLINE Vc_PURE __m128d dataD() const { return SSE::sse_cast<__m128d>(d.v()); }
  7755. private:
  7756. friend reference;
  7757. static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept
  7758. {
  7759. return MaskBool(m.d.m(i));
  7760. }
  7761. template <typename U>
  7762. static Vc_INTRINSIC void set(Mask &m, int i,
  7763. U &&v) noexcept(noexcept(MaskBool(std::declval<U>())))
  7764. {
  7765. m.d.set(i, MaskBool(std::forward<U>(v)));
  7766. }
  7767. public:
  7768. Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
  7769. {
  7770. return {*this, int(index)};
  7771. }
  7772. Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept
  7773. {
  7774. return get(*this, index);
  7775. }
  7776. Vc_ALWAYS_INLINE Vc_PURE int count() const
  7777. {
  7778. return Detail::mask_count<Size>(dataI());
  7779. }
  7780. Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
  7781. template <typename G> static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R;
  7782. Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R;
  7783. private:
  7784. #ifdef Vc_COMPILE_BENCHMARKS
  7785. public:
  7786. #endif
  7787. Storage d;
  7788. };
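// Editor's note: hypothetical usage sketch for the mask class above
// (illustrative only; SSE::float_m is the 4-lane alias, cf. double_m/short_m
// used further below):
//   SSE::float_m m(false);   // all lanes false
//   m[2] = true;             // per-lane write through EntryReference
//   int n = m.count();       // 1
//   int bits = m.toInt();    // 0b0100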
  7789. template <typename T> constexpr size_t Mask<T, VectorAbi::Sse>::Size;
  7790. template <typename T> constexpr size_t Mask<T, VectorAbi::Sse>::MemoryAlignment;
  7791. }
  7792. namespace Vc_VERSIONED_NAMESPACE
  7793. {
  7794. namespace Detail
  7795. {
  7796. template<> Vc_INTRINSIC Vc_CONST int mask_count<2>(__m128i k)
  7797. {
  7798. int mask = _mm_movemask_pd(_mm_castsi128_pd(k));
  7799. return (mask & 1) + (mask >> 1);
  7800. }
  7801. template<> Vc_INTRINSIC Vc_CONST int mask_count<4>(__m128i k)
  7802. {
  7803. #ifdef Vc_IMPL_POPCNT
  7804. return _mm_popcnt_u32(_mm_movemask_ps(_mm_castsi128_ps(k)));
  7805. #else
  7806. auto x = _mm_srli_epi32(k, 31);
  7807. x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
  7808. x = _mm_add_epi32(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)));
  7809. return _mm_cvtsi128_si32(x);
  7810. #endif
  7811. }
  7812. template<> Vc_INTRINSIC Vc_CONST int mask_count<8>(__m128i k)
  7813. {
  7814. #ifdef Vc_IMPL_POPCNT
  7815. return _mm_popcnt_u32(_mm_movemask_epi8(k)) / 2;
  7816. #else
  7817. auto x = _mm_srli_epi16(k, 15);
  7818. x = _mm_add_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
  7819. x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)));
  7820. x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)));
  7821. return _mm_extract_epi16(x, 0);
  7822. #endif
  7823. }
  7824. template<> Vc_INTRINSIC Vc_CONST int mask_count<16>(__m128i k)
  7825. {
  7826. return Detail::popcnt16(_mm_movemask_epi8(k));
  7827. }
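// Editor's note: a portable reference for the mask_count specializations
// above, added purely for documentation (assumes the SSE intrinsics headers
// already included by this file; not part of the library). Counting true
// lanes is the popcount of the corresponding movemask:
static inline int mask_count_reference_4(__m128i k)
{
    const int bits = _mm_movemask_ps(_mm_castsi128_ps(k)); // one sign bit per 32-bit lane
    return ((bits >> 0) & 1) + ((bits >> 1) & 1) + ((bits >> 2) & 1) + ((bits >> 3) & 1);
}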
  7828. template<> Vc_INTRINSIC Vc_CONST int mask_to_int<2>(__m128i k)
  7829. {
  7830. return _mm_movemask_pd(_mm_castsi128_pd(k));
  7831. }
  7832. template<> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m128i k)
  7833. {
  7834. return _mm_movemask_ps(_mm_castsi128_ps(k));
  7835. }
  7836. template<> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m128i k)
  7837. {
  7838. return _mm_movemask_epi8(_mm_packs_epi16(k, _mm_setzero_si128()));
  7839. }
  7840. template<> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m128i k)
  7841. {
  7842. return _mm_movemask_epi8(k);
  7843. }
  7844. template <size_t> Vc_ALWAYS_INLINE void mask_store(__m128i k, bool *mem);
  7845. template <> Vc_ALWAYS_INLINE void mask_store<16>(__m128i k, bool *mem)
  7846. {
  7847. _mm_store_si128(reinterpret_cast<__m128i *>(mem), _mm_and_si128(k, _mm_set1_epi8(1)));
  7848. }
  7849. template <> Vc_ALWAYS_INLINE void mask_store<8>(__m128i k, bool *mem)
  7850. {
  7851. k = _mm_srli_epi16(k, 15);
  7852. const auto k2 = _mm_packs_epi16(k, _mm_setzero_si128());
  7853. #ifdef __x86_64__
  7854. *aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k2);
  7855. #else
  7856. _mm_store_sd(aliasing_cast<double>(mem), _mm_castsi128_pd(k2));
  7857. #endif
  7858. }
  7859. template <> Vc_ALWAYS_INLINE void mask_store<4>(__m128i k, bool *mem)
  7860. {
  7861. *aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(
  7862. _mm_packs_epi16(_mm_srli_epi16(_mm_packs_epi32(k, _mm_setzero_si128()), 15),
  7863. _mm_setzero_si128()));
  7864. }
  7865. template <> Vc_ALWAYS_INLINE void mask_store<2>(__m128i k, bool *mem)
  7866. {
  7867. mem[0] = -SseIntrinsics::extract_epi32<1>(k);
  7868. mem[1] = -SseIntrinsics::extract_epi32<3>(k);
  7869. }
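// Editor's note: the mask_store specializations above compress lane-wide
// masks into one bool byte (0 or 1) per element. The <2> case extracts the
// upper 32 bits of each 64-bit lane and negates them, since a true lane is
// the all-ones pattern, i.e. -1 as a signed integer.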
  7870. template<size_t> Vc_ALWAYS_INLINE __m128 mask_load(const bool *mem);
  7871. template<> Vc_ALWAYS_INLINE __m128 mask_load<16>(const bool *mem)
  7872. {
  7873. return sse_cast<__m128>(_mm_cmpgt_epi8(
  7874. _mm_load_si128(reinterpret_cast<const __m128i *>(mem)), _mm_setzero_si128()));
  7875. }
  7876. template<> Vc_ALWAYS_INLINE __m128 mask_load<8>(const bool *mem)
  7877. {
  7878. #ifdef __x86_64__
  7879. __m128i k = _mm_cvtsi64_si128(*reinterpret_cast<const int64_t *>(mem));
  7880. #else
  7881. __m128i k = _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(mem)));
  7882. #endif
  7883. return sse_cast<__m128>(_mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
  7884. }
  7885. template<> Vc_ALWAYS_INLINE __m128 mask_load<4>(const bool *mem)
  7886. {
  7887. __m128i k = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem));
  7888. k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
  7889. return sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
  7890. }
  7891. template<> Vc_ALWAYS_INLINE __m128 mask_load<2>(const bool *mem)
  7892. {
  7893. return sse_cast<__m128>(
  7894. _mm_set_epi32(-int(mem[1]), -int(mem[1]), -int(mem[0]), -int(mem[0])));
  7895. }
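// Editor's note: mask_load is the inverse of mask_store: the packed bool
// bytes are widened back to full-width lanes, either via a greater-than-zero
// compare (any non-zero byte becomes all-ones) or, in the <2> case, by
// broadcasting -int(mem[n]) into both 32-bit halves of each 64-bit lane.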
  7896. template <> Vc_INTRINSIC Vc_CONST bool is_equal<2>(__m128 k1, __m128 k2)
  7897. {
  7898. return _mm_movemask_pd(_mm_castps_pd(k1)) == _mm_movemask_pd(_mm_castps_pd(k2));
  7899. }
  7900. template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<2>(__m128 k1, __m128 k2)
  7901. {
  7902. return _mm_movemask_pd(_mm_castps_pd(k1)) != _mm_movemask_pd(_mm_castps_pd(k2));
  7903. }
  7904. template <> Vc_INTRINSIC Vc_CONST bool is_equal<4>(__m128 k1, __m128 k2)
  7905. {
  7906. return _mm_movemask_ps(k1) == _mm_movemask_ps(k2);
  7907. }
  7908. template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<4>(__m128 k1, __m128 k2)
  7909. {
  7910. return _mm_movemask_ps(k1) != _mm_movemask_ps(k2);
  7911. }
  7912. template <> Vc_INTRINSIC Vc_CONST bool is_equal<8>(__m128 k1, __m128 k2)
  7913. {
  7914. return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
  7915. _mm_movemask_epi8(_mm_castps_si128(k2));
  7916. }
  7917. template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<8>(__m128 k1, __m128 k2)
  7918. {
  7919. return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
  7920. _mm_movemask_epi8(_mm_castps_si128(k2));
  7921. }
  7922. template <> Vc_INTRINSIC Vc_CONST bool is_equal<16>(__m128 k1, __m128 k2)
  7923. {
  7924. return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
  7925. _mm_movemask_epi8(_mm_castps_si128(k2));
  7926. }
  7927. template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<16>(__m128 k1, __m128 k2)
  7928. {
  7929. return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
  7930. _mm_movemask_epi8(_mm_castps_si128(k2));
  7931. }
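// Editor's note: mask (in)equality above is decided by comparing the movemask
// bit patterns rather than the full registers; this is sufficient because
// each lane of a canonical mask is either all-ones or all-zeros, so the sign
// bits alone determine the value.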
  7932. }
  7933. template<> Vc_ALWAYS_INLINE void SSE::double_m::store(bool *mem) const
  7934. {
  7935. *aliasing_cast<uint16_t>(mem) = _mm_movemask_epi8(dataI()) & 0x0101;
  7936. }
  7937. template<typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::store(bool *mem) const
  7938. {
  7939. Detail::mask_store<Size>(dataI(), mem);
  7940. }
  7941. template<> Vc_ALWAYS_INLINE void SSE::double_m::load(const bool *mem)
  7942. {
  7943. d.set(0, MaskBool(mem[0]));
  7944. d.set(1, MaskBool(mem[1]));
  7945. }
  7946. template <typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::load(const bool *mem)
  7947. {
  7948. d.v() = sse_cast<VectorType>(Detail::mask_load<Size>(mem));
  7949. }
  7950. template <>
  7951. Vc_INTRINSIC Vc_PURE bool SSE::short_m::get(const SSE::short_m &m, int index) noexcept
  7952. {
  7953. return m.shiftMask() & (1 << 2 * index);
  7954. }
  7955. template <>
  7956. Vc_INTRINSIC Vc_PURE bool SSE::ushort_m::get(const SSE::ushort_m &m, int index) noexcept
  7957. {
  7958. return m.shiftMask() & (1 << 2 * index);
  7959. }
  7960. template<typename T> Vc_ALWAYS_INLINE Vc_PURE int Mask<T, VectorAbi::Sse>::firstOne() const
  7961. {
  7962. const int mask = toInt();
  7963. #ifdef _MSC_VER
  7964. unsigned long bit;
  7965. _BitScanForward(&bit, mask);
  7966. #else
  7967. int bit;
  7968. __asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask));
  7969. #endif
  7970. return bit;
  7971. }
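// Editor's note: firstOne() returns the index of the lowest set lane via a
// bit-scan-forward over toInt(); the result is unspecified for an empty mask.
// On GCC/Clang an equivalent portable form would be __builtin_ctz(toInt())
// (an illustrative assumption, not what the library emits on every target).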
  7972. template <typename M, typename G>
  7973. Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 2>)
  7974. {
  7975. return _mm_set_epi64x(gen(1) ? 0xffffffffffffffffull : 0,
  7976. gen(0) ? 0xffffffffffffffffull : 0);
  7977. }
  7978. template <typename M, typename G>
  7979. Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4>)
  7980. {
  7981. return _mm_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
  7982. gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0);
  7983. }
  7984. template <typename M, typename G>
  7985. Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8>)
  7986. {
  7987. return _mm_setr_epi16(gen(0) ? 0xffffu : 0, gen(1) ? 0xffffu : 0,
  7988. gen(2) ? 0xffffu : 0, gen(3) ? 0xffffu : 0,
  7989. gen(4) ? 0xffffu : 0, gen(5) ? 0xffffu : 0,
  7990. gen(6) ? 0xffffu : 0, gen(7) ? 0xffffu : 0);
  7991. }
  7992. template <typename T>
  7993. template <typename G>
  7994. Vc_INTRINSIC Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::generate(G &&gen)
  7995. {
  7996. return generate_impl<Mask<T, VectorAbi::Sse>>(std::forward<G>(gen),
  7997. std::integral_constant<int, Size>());
  7998. }
  7999. template <typename T> Vc_INTRINSIC Vc_PURE Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::shifted(int amount) const
  8000. {
  8001. switch (amount * int(sizeof(VectorEntryType))) {
  8002. case 0: return *this;
  8003. case 1: return Detail::shifted< 1>(dataI());
  8004. case 2: return Detail::shifted< 2>(dataI());
  8005. case 3: return Detail::shifted< 3>(dataI());
  8006. case 4: return Detail::shifted< 4>(dataI());
  8007. case 5: return Detail::shifted< 5>(dataI());
  8008. case 6: return Detail::shifted< 6>(dataI());
  8009. case 7: return Detail::shifted< 7>(dataI());
  8010. case 8: return Detail::shifted< 8>(dataI());
  8011. case 9: return Detail::shifted< 9>(dataI());
  8012. case 10: return Detail::shifted< 10>(dataI());
  8013. case 11: return Detail::shifted< 11>(dataI());
  8014. case 12: return Detail::shifted< 12>(dataI());
  8015. case 13: return Detail::shifted< 13>(dataI());
  8016. case 14: return Detail::shifted< 14>(dataI());
  8017. case 15: return Detail::shifted< 15>(dataI());
  8018. case 16: return Detail::shifted< 16>(dataI());
  8019. case -1: return Detail::shifted< -1>(dataI());
  8020. case -2: return Detail::shifted< -2>(dataI());
  8021. case -3: return Detail::shifted< -3>(dataI());
  8022. case -4: return Detail::shifted< -4>(dataI());
  8023. case -5: return Detail::shifted< -5>(dataI());
  8024. case -6: return Detail::shifted< -6>(dataI());
  8025. case -7: return Detail::shifted< -7>(dataI());
  8026. case -8: return Detail::shifted< -8>(dataI());
  8027. case -9: return Detail::shifted< -9>(dataI());
  8028. case -10: return Detail::shifted<-10>(dataI());
  8029. case -11: return Detail::shifted<-11>(dataI());
  8030. case -12: return Detail::shifted<-12>(dataI());
  8031. case -13: return Detail::shifted<-13>(dataI());
  8032. case -14: return Detail::shifted<-14>(dataI());
  8033. case -15: return Detail::shifted<-15>(dataI());
  8034. case -16: return Detail::shifted<-16>(dataI());
  8035. }
  8036. return Zero();
  8037. }
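// Editor's note: shifted() converts the element shift into a byte shift
// (amount * sizeof(VectorEntryType)) and dispatches through the switch
// because the underlying byte-shift primitives require a compile-time
// immediate count; out-of-range amounts fall through to Zero().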
  8038. }
  8039. #endif
  8040. #include <algorithm>
  8041. #include <cmath>
  8042. #ifdef isfinite
  8043. #undef isfinite
  8044. #endif
  8045. #ifdef isnan
  8046. #undef isnan
  8047. #endif
  8048. namespace Vc_VERSIONED_NAMESPACE
  8049. {
  8050. #define Vc_CURRENT_CLASS_NAME Vector
  8051. template <typename T> class Vector<T, VectorAbi::Sse>
  8052. {
  8053. static_assert(std::is_arithmetic<T>::value,
  8054. "Vector<T> only accepts arithmetic builtin types as template parameter T.");
  8055. protected:
  8056. #ifdef Vc_COMPILE_BENCHMARKS
  8057. public:
  8058. #endif
  8059. typedef typename SSE::VectorTraits<T>::StorageType StorageType;
  8060. StorageType d;
  8061. typedef typename SSE::VectorTraits<T>::GatherMaskType GatherMask;
  8062. typedef SSE::VectorHelper<typename SSE::VectorTraits<T>::VectorType> HV;
  8063. typedef SSE::VectorHelper<T> HT;
  8064. public:
  8065. Vc_FREE_STORE_OPERATORS_ALIGNED(16);
  8066. typedef typename SSE::VectorTraits<T>::VectorType VectorType;
  8067. using vector_type = VectorType;
  8068. static constexpr size_t Size = SSE::VectorTraits<T>::Size;
  8069. static constexpr size_t MemoryAlignment = alignof(VectorType);
  8070. typedef typename SSE::VectorTraits<T>::EntryType EntryType;
  8071. using value_type = EntryType;
  8072. using VectorEntryType = EntryType;
  8073. using IndexType = fixed_size_simd<int, Size>;
  8074. typedef typename SSE::VectorTraits<T>::MaskType Mask;
  8075. using MaskType = Mask;
  8076. using mask_type = Mask;
  8077. typedef typename Mask::Argument MaskArg;
  8078. typedef typename Mask::Argument MaskArgument;
  8079. typedef const Vector AsArg;
  8080. using abi = VectorAbi::Sse;
  8081. using WriteMaskedVector = Common::WriteMaskedVector<Vector, Mask>;
  8082. template <typename U> using V = Vector<U, abi>;
  8083. using reference = Detail::ElementReference<Vector>;
  8084. public:
  8085. Vc_INTRINSIC Vector() = default;
  8086. static constexpr std::size_t size() { return Size; }
  8087. explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R;
  8088. explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R;
  8089. explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R;
  8090. static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); }
  8091. static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); }
  8092. static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero()
  8093. {
  8094. return Vector(Vc::IndexesFromZero);
  8095. }
  8096. template <class G, int = 0,
  8097. class = typename std::enable_if<std::is_convertible<
  8098. decltype(std::declval<G>()(size_t())), value_type>::value>::type>
  8099. explicit Vector(G &&g) : Vector(generate(std::forward<G>(g)))
  8100. {
  8101. }
  8102. static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R;
  8103. Vc_ALWAYS_INLINE Vector(VectorType x) : d(x) {}
  8104. template <typename U>
  8105. Vc_INTRINSIC Vector(
  8106. V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
  8107. void *>::type = nullptr)
  8108. : d(SSE::convert<U, T>(x.data()))
  8109. {
  8110. }
  8111. #if Vc_IS_VERSION_1
  8112. template <typename U>
  8113. Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
  8114. "vector types") Vc_INTRINSIC
  8115. explicit Vector(
  8116. V<U> x,
  8117. typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
  8118. void *>::type = nullptr)
  8119. : d(SSE::convert<U, T>(x.data()))
  8120. {
  8121. }
  8122. #endif
  8123. Vc_INTRINSIC Vector(EntryType a) : d(HT::set(a)) {}
  8124. template <typename U>
  8125. Vc_INTRINSIC Vector(U a,
  8126. typename std::enable_if<std::is_same<U, int>::value &&
  8127. !std::is_same<U, EntryType>::value,
  8128. void *>::type = nullptr)
  8129. : Vector(static_cast<EntryType>(a))
  8130. {
  8131. }
  8132. explicit Vc_INTRINSIC Vector(const EntryType *mem)
  8133. {
  8134. load(mem);
  8135. }
  8136. template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
  8137. explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
  8138. {
  8139. load(mem, flags);
  8140. }
  8141. template <typename U, typename Flags = DefaultLoadTag,
  8142. typename = enable_if<
  8143. (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
  8144. sizeof(EntryType) >= sizeof(U)) &&
8145. std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
  8146. explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
  8147. {
  8148. load<U, Flags>(x, flags);
  8149. }
  8150. Vc_INTRINSIC void load(const EntryType *mem)
  8151. {
  8152. load(mem, DefaultLoadTag());
  8153. }
  8154. template <typename Flags>
  8155. Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
  8156. load(const EntryType *mem, Flags flags)
  8157. {
  8158. load<EntryType, Flags>(mem, flags);
  8159. }
  8160. private:
  8161. template <typename U, typename Flags>
  8162. struct load_concept : public std::enable_if<
  8163. (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
  8164. sizeof(EntryType) >= sizeof(U)) &&
  8165. std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
  8166. {};
  8167. public:
  8168. template <typename U, typename Flags = DefaultLoadTag>
  8169. Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
  8170. template <
  8171. typename U,
  8172. typename Flags = DefaultStoreTag,
8173. typename = enable_if<std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
  8174. Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
  8175. template <
  8176. typename U,
  8177. typename Flags = DefaultStoreTag,
8178. typename = enable_if<std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
  8179. Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
  8180. Vc_INTRINSIC void store(EntryType *mem) const
  8181. {
  8182. store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
  8183. }
  8184. template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
  8185. Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
  8186. {
  8187. store<EntryType, Flags>(mem, flags);
  8188. }
  8189. Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
  8190. {
  8191. store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
  8192. }
  8193. template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
  8194. Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
  8195. {
  8196. store<EntryType, Flags>(mem, mask, flags);
  8197. }
  8198. Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R;
  8199. Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R;
  8200. Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R;
  8201. Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
  8202. Vc_INTRINSIC_L void setQnan(const Mask &k) Vc_INTRINSIC_R;
  8203. #ifndef Vc_CURRENT_CLASS_NAME
  8204. #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
  8205. #endif
  8206. private:
  8207. template <class MT, class IT, int Scale = 1>
  8208. inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
  8209. template <class MT, class IT, int Scale = 1>
  8210. inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
  8211. MaskArgument mask);
  8212. public:
  8213. #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
  8214. static_assert( \
  8215. std::is_convertible<MT, EntryType>::value, \
  8216. "The memory pointer needs to point to a type that can be converted to the " \
  8217. "EntryType of this SIMD vector type."); \
  8218. static_assert( \
  8219. Vc::Traits::has_subscript_operator<IT>::value, \
  8220. "The indexes argument must be a type that implements the subscript operator."); \
  8221. static_assert( \
  8222. !Traits::is_simd_vector<IT>::value || \
  8223. Traits::simd_vector_size<IT>::value >= Size, \
  8224. "If you use a SIMD vector for the indexes parameter, the index vector must " \
  8225. "have at least as many entries as this SIMD vector."); \
  8226. static_assert( \
  8227. !std::is_array<T>::value || \
  8228. (std::rank<T>::value == 1 && \
  8229. (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
  8230. "If you use a simple array for the indexes parameter, the array must have " \
  8231. "at least as many entries as this SIMD vector.")
  8232. template <typename MT, typename IT,
  8233. typename = enable_if<Traits::has_subscript_operator<IT>::value>>
  8234. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
  8235. {
  8236. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  8237. gatherImplementation(
  8238. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
  8239. }
  8240. template <class MT, class IT, int Scale>
  8241. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
  8242. {
  8243. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  8244. gatherImplementation(args);
  8245. }
  8246. template <typename MT, typename IT,
  8247. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  8248. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
  8249. MaskArgument mask)
  8250. {
  8251. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  8252. gatherImplementation(
  8253. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
  8254. }
  8255. template <class MT, class IT, int Scale>
  8256. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
  8257. MaskArgument mask)
  8258. {
  8259. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  8260. gatherImplementation(args, mask);
  8261. }
  8262. template <typename MT, typename IT,
  8263. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  8264. Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
  8265. {
  8266. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  8267. gatherImplementation(
  8268. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
  8269. }
  8270. template <typename MT, typename IT,
  8271. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  8272. Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
  8273. {
  8274. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  8275. gatherImplementation(
  8276. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
  8277. }
  8278. template <class MT, class IT, int Scale>
  8279. Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
  8280. {
  8281. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  8282. gatherImplementation(args);
  8283. }
  8284. template <class MT, class IT, int Scale>
  8285. Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
  8286. MaskArgument mask)
  8287. {
  8288. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  8289. gatherImplementation(args, mask);
  8290. }
  8291. #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
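// Editor's note: hypothetical gather usage sketch for the interface above
// (illustrative only; array contents and sizes are made up):
//   float mem[64] = { /* ... */ };
//   SSE::int_v idx = SSE::int_v::IndexesFromZero() * 4;  // {0, 4, 8, 12}
//   SSE::float_v v(mem, idx);                            // v[n] = mem[idx[n]]
//   v.gather(mem, idx, v > 0.f);                         // masked gather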
  8292. private:
  8293. template <typename MT, typename IT>
  8294. inline void scatterImplementation(MT *mem, IT &&indexes) const;
  8295. template <typename MT, typename IT>
  8296. inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
  8297. public:
  8298. #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
  8299. static_assert( \
  8300. std::is_convertible<EntryType, MT>::value, \
  8301. "The memory pointer needs to point to a type that the EntryType of this " \
  8302. "SIMD vector type can be converted to."); \
  8303. static_assert( \
  8304. Vc::Traits::has_subscript_operator<IT>::value, \
  8305. "The indexes argument must be a type that implements the subscript operator."); \
  8306. static_assert( \
  8307. !Traits::is_simd_vector<IT>::value || \
  8308. Traits::simd_vector_size<IT>::value >= Size, \
  8309. "If you use a SIMD vector for the indexes parameter, the index vector must " \
  8310. "have at least as many entries as this SIMD vector."); \
  8311. static_assert( \
  8312. !std::is_array<T>::value || \
  8313. (std::rank<T>::value == 1 && \
  8314. (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
  8315. "If you use a simple array for the indexes parameter, the array must have " \
  8316. "at least as many entries as this SIMD vector.")
  8317. template <typename MT,
  8318. typename IT,
  8319. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  8320. Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
  8321. {
  8322. Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
  8323. scatterImplementation(mem, std::forward<IT>(indexes));
  8324. }
  8325. template <typename MT,
  8326. typename IT,
  8327. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  8328. Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
  8329. {
  8330. Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
  8331. scatterImplementation(mem, std::forward<IT>(indexes), mask);
  8332. }
  8333. template <typename MT, typename IT>
  8334. Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
  8335. {
  8336. scatter(args.address, args.indexes);
  8337. }
  8338. template <typename MT, typename IT>
  8339. Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
  8340. {
  8341. scatter(args.address, args.indexes, mask);
  8342. }
  8343. #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
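// Editor's note: matching scatter usage sketch (illustrative only, reusing
// the hypothetical mem/idx from the gather note above):
//   SSE::float_v v = /* ... */;
//   v.scatter(mem, idx);            // mem[idx[n]] = v[n]
//   v.scatter(mem, idx, v != 0.f);  // masked scatter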
  8344. #if defined Vc_IMPL_AVX2 && !defined Vc_MSVC
  8345. template <class U, class A, int Scale, int N = Vector<U, A>::size(),
  8346. class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
  8347. Vc_INTRINSIC void gatherImplementation(
  8348. const Common::GatherArguments<T, Vector<U, A>, Scale> &args)
  8349. {
  8350. d.v() = SSE::gather<sizeof(T) * Scale>(
  8351. args.address, simd_cast<SSE::int_v>(args.indexes).data());
  8352. }
  8353. template <class U, class A, int Scale, int N = Vector<U, A>::size(),
  8354. class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
  8355. Vc_INTRINSIC void gatherImplementation(
  8356. const Common::GatherArguments<T, Vector<U, A>, Scale> &args, MaskArgument k)
  8357. {
  8358. d.v() = SSE::gather<sizeof(T) * Scale>(
  8359. d.v(), k.data(), args.address,
  8360. simd_cast<SSE::int_v>(args.indexes).data());
  8361. }
  8362. template <
  8363. class MT, class U, class A, int Scale,
  8364. class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
  8365. (sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
  8366. Vc_INTRINSIC void gatherImplementation(
  8367. const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
  8368. {
  8369. using AVX2::int_v;
  8370. const auto idx = simd_cast<int_v>(args.indexes).data();
  8371. *this = simd_cast<Vector>(int_v(
  8372. AVX::gather<sizeof(MT) * Scale>(aliasing_cast<int>(args.address), idx)));
  8373. if (sizeof(MT) == 1) {
  8374. if (std::is_signed<MT>::value) {
  8375. d.v() = _mm_srai_epi16(_mm_slli_epi16(d.v(), 8), 8);
  8376. } else {
  8377. *this &= 0xff;
  8378. }
  8379. }
  8380. }
  8381. template <
  8382. class MT, class U, class A, int Scale,
  8383. class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
  8384. (sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
  8385. Vc_INTRINSIC void gatherImplementation(
  8386. const Common::GatherArguments<MT, Vector<U, A>, Scale> &args, MaskArgument k)
  8387. {
  8388. using AVX2::int_v;
  8389. auto v = simd_cast<Vector>(int_v(AVX::gather<sizeof(MT) * Scale>(
  8390. _mm256_setzero_si256(), simd_cast<AVX2::int_m>(k).data(),
  8391. aliasing_cast<int>(args.address),
  8392. simd_cast<int_v>(args.indexes).data())));
  8393. if (sizeof(MT) == 1) {
  8394. if (std::is_signed<MT>::value) {
  8395. v.data() = _mm_srai_epi16(_mm_slli_epi16(v.data(), 8), 8);
  8396. } else {
  8397. v &= 0xff;
  8398. }
  8399. }
  8400. assign(v, k);
  8401. }
  8402. template <class MT, class U, class A, int Scale>
  8403. Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
  8404. Traits::is_valid_vector_argument<MT>::value &&
  8405. !std::is_same<MT, T>::value &&
  8406. Vector<U, A>::size() >= size()),
  8407. void>
  8408. gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
  8409. {
  8410. *this = simd_cast<Vector>(fixed_size_simd<MT, Size>(args));
  8411. }
  8412. template <class MT, class U, class A, int Scale>
  8413. Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
  8414. Traits::is_valid_vector_argument<MT>::value &&
  8415. !std::is_same<MT, T>::value &&
  8416. Vector<U, A>::size() >= size()),
  8417. void>
  8418. gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args,
  8419. MaskArgument k)
  8420. {
  8421. assign(simd_cast<Vector>(fixed_size_simd<MT, Size>(args, k)), k);
  8422. }
  8423. #endif
  8424. Vc_INTRINSIC Vector &operator++() { data() = HT::add(data(), HT::one()); return *this; }
  8425. Vc_INTRINSIC Vector &operator--() { data() = HT::sub(data(), HT::one()); return *this; }
  8426. Vc_INTRINSIC Vector operator++(int) { const Vector r = *this; data() = HT::add(data(), HT::one()); return r; }
  8427. Vc_INTRINSIC Vector operator--(int) { const Vector r = *this; data() = HT::sub(data(), HT::one()); return r; }
  8428. private:
  8429. friend reference;
  8430. Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
  8431. {
  8432. return o.d.m(i);
  8433. }
  8434. template <typename U>
  8435. Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
  8436. noexcept(std::declval<value_type &>() = v))
  8437. {
  8438. o.d.set(i, v);
  8439. }
  8440. public:
  8441. Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
  8442. {
  8443. static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
  8444. return {*this, int(index)};
  8445. }
  8446. Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
  8447. {
  8448. return d.m(index);
  8449. }
  8450. Vc_INTRINSIC_L Vector Vc_VDECL operator[](const SSE::int_v &perm) const Vc_INTRINSIC_R;
  8451. Vc_INTRINSIC Vc_PURE Mask operator!() const
  8452. {
  8453. return *this == Zero();
  8454. }
  8455. Vc_INTRINSIC Vc_PURE Vector operator~() const
  8456. {
  8457. #ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
  8458. static_assert(std::is_integral<T>::value,
  8459. "bit-complement can only be used with Vectors of integral type");
  8460. #endif
  8461. return Detail::andnot_(data(), HV::allone());
  8462. }
  8463. Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
  8464. Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; }
  8465. Vc_ALWAYS_INLINE Vector Vc_VDECL operator<< (AsArg shift) const { return generate([&](int i) { return get(*this, i) << get(shift, i); }); }
  8466. Vc_ALWAYS_INLINE Vector Vc_VDECL operator>> (AsArg shift) const { return generate([&](int i) { return get(*this, i) >> get(shift, i); }); }
  8467. Vc_ALWAYS_INLINE Vector &Vc_VDECL operator<<=(AsArg shift) { return *this = *this << shift; }
  8468. Vc_ALWAYS_INLINE Vector &Vc_VDECL operator>>=(AsArg shift) { return *this = *this >> shift; }
  8469. Vc_INTRINSIC_L Vector &Vc_VDECL operator<<=( int shift) Vc_INTRINSIC_R;
  8470. Vc_INTRINSIC_L Vector Vc_VDECL operator<< ( int shift) const Vc_INTRINSIC_R;
  8471. Vc_INTRINSIC_L Vector &Vc_VDECL operator>>=( int shift) Vc_INTRINSIC_R;
  8472. Vc_INTRINSIC_L Vector Vc_VDECL operator>> ( int shift) const Vc_INTRINSIC_R;
  8473. Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
  8474. isNegative() const
  8475. {
  8476. return Vc::isnegative(*this);
  8477. }
  8478. Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &mask)
  8479. {
  8480. data() = HV::blend(data(), v.data(), mask.data());
  8481. }
  8482. template <typename V2>
  8483. Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast")
  8484. Vc_ALWAYS_INLINE Vc_PURE V2 staticCast() const
  8485. {
  8486. return SSE::convert<T, typename V2::EntryType>(data());
  8487. }
  8488. template <typename V2>
  8489. Vc_DEPRECATED("use reinterpret_components_cast instead")
  8490. Vc_ALWAYS_INLINE Vc_PURE V2 reinterpretCast() const
  8491. {
  8492. return SSE::sse_cast<typename V2::VectorType>(data());
  8493. }
  8494. Vc_INTRINSIC WriteMaskedVector operator()(const Mask &k) { return {*this, k}; }
  8495. Vc_ALWAYS_INLINE Vc_PURE VectorType &data() { return d.v(); }
  8496. Vc_ALWAYS_INLINE Vc_PURE const VectorType &data() const { return d.v(); }
  8497. template<int Index>
  8498. Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R;
  8499. Vc_INTRINSIC EntryType min() const { return HT::min(data()); }
  8500. Vc_INTRINSIC EntryType max() const { return HT::max(data()); }
  8501. Vc_INTRINSIC EntryType product() const { return HT::mul(data()); }
  8502. Vc_INTRINSIC EntryType sum() const { return HT::add(data()); }
  8503. Vc_INTRINSIC_L Vector partialSum() const Vc_INTRINSIC_R;
  8504. Vc_INTRINSIC_L EntryType min(MaskArg m) const Vc_INTRINSIC_R;
  8505. Vc_INTRINSIC_L EntryType max(MaskArg m) const Vc_INTRINSIC_R;
  8506. Vc_INTRINSIC_L EntryType product(MaskArg m) const Vc_INTRINSIC_R;
  8507. Vc_INTRINSIC_L EntryType sum(MaskArg m) const Vc_INTRINSIC_R;
  8508. Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R;
  8509. Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R;
  8510. Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R;
  8511. Vc_INTRINSIC_L Vc_PURE_L Vector reversed() const Vc_INTRINSIC_R Vc_PURE_R;
  8512. Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
  8513. template <typename F> void callWithValuesSorted(F &&f)
  8514. {
  8515. EntryType value = d.m(0);
  8516. f(value);
  8517. for (std::size_t i = 1; i < Size; ++i) {
  8518. if (d.m(i) != value) {
  8519. value = d.m(i);
  8520. f(value);
  8521. }
  8522. }
  8523. }
  8524. template <typename F> Vc_INTRINSIC void call(F &&f) const
  8525. {
  8526. Common::for_all_vector_entries<Size>([&](size_t i) { f(EntryType(d.m(i))); });
  8527. }
  8528. template <typename F> Vc_INTRINSIC void call(F &&f, const Mask &mask) const
  8529. {
  8530. for(size_t i : where(mask)) {
  8531. f(EntryType(d.m(i)));
  8532. }
  8533. }
  8534. template <typename F> Vc_INTRINSIC Vector apply(F &&f) const
  8535. {
  8536. Vector r;
  8537. Common::for_all_vector_entries<Size>(
  8538. [&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); });
  8539. return r;
  8540. }
  8541. template <typename F> Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const
  8542. {
  8543. Vector r(*this);
  8544. for (size_t i : where(mask)) {
  8545. r.d.set(i, f(EntryType(r.d.m(i))));
  8546. }
  8547. return r;
  8548. }
  8549. template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
  8550. Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f(i)); });
  8551. }
  8552. Vc_INTRINSIC void fill(EntryType (&f)()) {
  8553. Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f()); });
  8554. }
  8555. template <typename G> static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R;
  8556. Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector
  8557. copySign(AsArg x) const
  8558. {
  8559. return Vc::copysign(*this, x);
  8560. }
  8561. Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
  8562. {
  8563. return Vc::exponent(*this);
  8564. }
  8565. Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R;
  8566. Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R;
  8567. };
  8568. #undef Vc_CURRENT_CLASS_NAME
  8569. template <typename T> constexpr size_t Vector<T, VectorAbi::Sse>::Size;
  8570. template <typename T> constexpr size_t Vector<T, VectorAbi::Sse>::MemoryAlignment;
  8571. static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v min(const SSE::int_v &x, const SSE::int_v &y) { return SSE::min_epi32(x.data(), y.data()); }
  8572. static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v min(const SSE::uint_v &x, const SSE::uint_v &y) { return SSE::min_epu32(x.data(), y.data()); }
  8573. static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v min(const SSE::short_v &x, const SSE::short_v &y) { return _mm_min_epi16(x.data(), y.data()); }
  8574. static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v min(const SSE::ushort_v &x, const SSE::ushort_v &y) { return SSE::min_epu16(x.data(), y.data()); }
  8575. static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v min(const SSE::float_v &x, const SSE::float_v &y) { return _mm_min_ps(x.data(), y.data()); }
  8576. static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v min(const SSE::double_v &x, const SSE::double_v &y) { return _mm_min_pd(x.data(), y.data()); }
  8577. static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v max(const SSE::int_v &x, const SSE::int_v &y) { return SSE::max_epi32(x.data(), y.data()); }
  8578. static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v max(const SSE::uint_v &x, const SSE::uint_v &y) { return SSE::max_epu32(x.data(), y.data()); }
  8579. static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v max(const SSE::short_v &x, const SSE::short_v &y) { return _mm_max_epi16(x.data(), y.data()); }
  8580. static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v max(const SSE::ushort_v &x, const SSE::ushort_v &y) { return SSE::max_epu16(x.data(), y.data()); }
  8581. static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v max(const SSE::float_v &x, const SSE::float_v &y) { return _mm_max_ps(x.data(), y.data()); }
  8582. static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v max(const SSE::double_v &x, const SSE::double_v &y) { return _mm_max_pd(x.data(), y.data()); }
  8583. template <typename T,
  8584. typename = enable_if<std::is_same<T, double>::value || std::is_same<T, float>::value ||
  8585. std::is_same<T, short>::value ||
  8586. std::is_same<T, int>::value>>
  8587. Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> abs(Vector<T, VectorAbi::Sse> x)
  8588. {
  8589. return SSE::VectorHelper<T>::abs(x.data());
  8590. }
  8591. template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> sqrt (const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::sqrt(x.data()); }
  8592. template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> rsqrt(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::rsqrt(x.data()); }
  8593. template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> reciprocal(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::reciprocal(x.data()); }
  8594. template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> round(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::round(x.data()); }
  8595. template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::Mask isfinite(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::isFinite(x.data()); }
  8596. template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::Mask isinf(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::isInfinite(x.data()); }
  8597. template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::Mask isnan(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::isNaN(x.data()); }
  8598. #define Vc_CONDITIONAL_ASSIGN(name_,op_) \
  8599. template <Operator O, typename T, typename M, typename U> \
  8600. Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
  8601. Vector<T, VectorAbi::Sse> &lhs, M &&mask, U &&rhs) \
  8602. { \
  8603. lhs(mask) op_ rhs; \
  8604. } \
  8605. Vc_NOTHING_EXPECTING_SEMICOLON
  8606. Vc_CONDITIONAL_ASSIGN( Assign, =);
  8607. Vc_CONDITIONAL_ASSIGN( PlusAssign, +=);
  8608. Vc_CONDITIONAL_ASSIGN( MinusAssign, -=);
  8609. Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
  8610. Vc_CONDITIONAL_ASSIGN( DivideAssign, /=);
  8611. Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
  8612. Vc_CONDITIONAL_ASSIGN( XorAssign, ^=);
  8613. Vc_CONDITIONAL_ASSIGN( AndAssign, &=);
  8614. Vc_CONDITIONAL_ASSIGN( OrAssign, |=);
  8615. Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
  8616. Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
  8617. #undef Vc_CONDITIONAL_ASSIGN
  8618. #define Vc_CONDITIONAL_ASSIGN(name_,expr_) \
  8619. template <Operator O, typename T, typename M> \
  8620. Vc_INTRINSIC enable_if<O == Operator::name_, Vector<T, VectorAbi::Sse>> \
  8621. conditional_assign(Vector<T, VectorAbi::Sse> &lhs, M &&mask) \
  8622. { \
  8623. return expr_; \
  8624. } \
  8625. Vc_NOTHING_EXPECTING_SEMICOLON
  8626. Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
  8627. Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
  8628. Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
  8629. Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
  8630. #undef Vc_CONDITIONAL_ASSIGN
  8631. }
  8632. #ifndef VC_COMMON_X86_PREFETCHES_H_
  8633. #define VC_COMMON_X86_PREFETCHES_H_
  8634. #include <xmmintrin.h>
  8635. namespace Vc_VERSIONED_NAMESPACE
  8636. {
  8637. namespace Common
  8638. {
  8639. static constexpr int exclusive_hint = 0;
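// Thin wrappers around _mm_prefetch. The ExclusiveOrShared tag chooses between a
// shared read hint and an exclusive (write-intent) hint; since exclusive_hint is 0
// here, both branches currently issue the same hint, and the tag only matters on
// targets that define a non-zero exclusive encoding.
// Usage sketch: prefetchClose<Vc::Exclusive>(ptr) requests the line at ptr into L1.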
  8640. template <typename ExclusiveOrShared = Vc::Shared>
  8641. Vc_INTRINSIC void prefetchForOneRead(const void *addr)
  8642. {
  8643. if (std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
  8644. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_NTA);
  8645. } else {
  8646. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)),
  8647. static_cast<decltype(_MM_HINT_NTA)>(_MM_HINT_NTA | exclusive_hint));
  8648. }
  8649. }
  8650. template <typename ExclusiveOrShared = Vc::Shared>
  8651. Vc_INTRINSIC void prefetchClose(const void *addr)
  8652. {
  8653. if (std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
  8654. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T0);
  8655. } else {
  8656. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)),
  8657. static_cast<decltype(_MM_HINT_T0)>(_MM_HINT_T0 | exclusive_hint));
  8658. }
  8659. }
  8660. template <typename ExclusiveOrShared = Vc::Shared>
  8661. Vc_INTRINSIC void prefetchMid(const void *addr)
  8662. {
  8663. if (std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
  8664. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T1);
  8665. } else {
  8666. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)),
  8667. static_cast<decltype(_MM_HINT_T1)>(_MM_HINT_T1 | exclusive_hint));
  8668. }
  8669. }
  8670. template <typename ExclusiveOrShared = Vc::Shared>
  8671. Vc_INTRINSIC void prefetchFar(const void *addr)
  8672. {
  8673. if (std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
  8674. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T2);
  8675. } else {
  8676. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)),
  8677. static_cast<decltype(_MM_HINT_T2)>(_MM_HINT_T2 | exclusive_hint));
  8678. }
  8679. }
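// handlePrefetch dispatches on the L1/L2 stride template parameters: a stride of 0
// disables the corresponding prefetch, so only the enabled distances are prefetched
// relative to the accessed address.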
  8680. namespace
  8681. {
  8682. template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if<L1 != 0 && L2 != 0, void *>::type = nullptr)
  8683. {
  8684. const char *addr = static_cast<const char *>(addr_);
  8685. prefetchClose<typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type>(addr + L1);
  8686. prefetchMid <typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type>(addr + L2);
  8687. }
  8688. template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if<L1 == 0 && L2 != 0, void *>::type = nullptr)
  8689. {
  8690. const char *addr = static_cast<const char *>(addr_);
  8691. prefetchMid <typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type>(addr + L2);
  8692. }
  8693. template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if<L1 != 0 && L2 == 0, void *>::type = nullptr)
  8694. {
  8695. const char *addr = static_cast<const char *>(addr_);
  8696. prefetchClose<typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type>(addr + L1);
  8697. }
  8698. template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *, typename std::enable_if<L1 == 0 && L2 == 0, void *>::type = nullptr)
  8699. {
  8700. }
  8701. template<typename Flags> Vc_INTRINSIC void handleLoadPrefetches(const void * , Flags, typename Flags::EnableIfNotPrefetch = nullptr) {}
  8702. template<typename Flags> Vc_INTRINSIC void handleLoadPrefetches(const void *addr, Flags, typename Flags::EnableIfPrefetch = nullptr)
  8703. {
  8704. handlePrefetch<Flags::L1Stride, Flags::L2Stride, Flags::IsExclusivePrefetch>(addr);
  8705. }
  8706. template<typename Flags> Vc_INTRINSIC void handleStorePrefetches(const void * , Flags, typename Flags::EnableIfNotPrefetch = nullptr) {}
  8707. template<typename Flags> Vc_INTRINSIC void handleStorePrefetches(const void *addr, Flags, typename Flags::EnableIfPrefetch = nullptr)
  8708. {
  8709. handlePrefetch<Flags::L1Stride, Flags::L2Stride, !Flags::IsSharedPrefetch>(addr);
  8710. }
  8711. }
  8712. }
  8713. using Common::prefetchForOneRead;
  8714. using Common::prefetchClose;
  8715. using Common::prefetchMid;
  8716. using Common::prefetchFar;
  8717. }
  8718. #endif
  8719. #ifndef VC_SSE_LIMITS_H_
  8720. #define VC_SSE_LIMITS_H_
  8721. namespace std
  8722. {
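// numeric_limits specializations for the SSE integer vectors: max()/min()/lowest()
// are built from bit patterns (all-ones, a logical shift, or the sign-bit constant),
// while the floating-point-only members (epsilon, infinity, NaN, denorm_min) have no
// meaning for integer vectors and simply return Zero().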
  8723. template<> struct numeric_limits< ::Vc::SSE::ushort_v> : public numeric_limits<unsigned short>
  8724. {
  8725. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v max() Vc_NOEXCEPT { return ::Vc::SSE::_mm_setallone_si128(); }
  8726. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v min() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
  8727. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v lowest() Vc_NOEXCEPT { return min(); }
  8728. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
  8729. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
  8730. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
  8731. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
  8732. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
  8733. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
  8734. };
  8735. template<> struct numeric_limits< ::Vc::SSE::short_v> : public numeric_limits<short>
  8736. {
  8737. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v max() Vc_NOEXCEPT { return _mm_srli_epi16(::Vc::SSE::_mm_setallone_si128(), 1); }
  8738. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v min() Vc_NOEXCEPT { return ::Vc::SSE::setmin_epi16(); }
  8739. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v lowest() Vc_NOEXCEPT { return min(); }
  8740. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
  8741. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
  8742. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
  8743. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
  8744. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
  8745. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
  8746. };
  8747. template<> struct numeric_limits< ::Vc::SSE::uint_v> : public numeric_limits<unsigned int>
  8748. {
  8749. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v max() Vc_NOEXCEPT { return ::Vc::SSE::_mm_setallone_si128(); }
  8750. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v min() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
  8751. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v lowest() Vc_NOEXCEPT { return min(); }
  8752. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
  8753. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
  8754. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
  8755. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
  8756. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
  8757. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
  8758. };
  8759. template<> struct numeric_limits< ::Vc::SSE::int_v> : public numeric_limits<int>
  8760. {
  8761. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v max() Vc_NOEXCEPT { return _mm_srli_epi32(::Vc::SSE::_mm_setallone_si128(), 1); }
  8762. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v min() Vc_NOEXCEPT { return ::Vc::SSE::setmin_epi32(); }
  8763. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v lowest() Vc_NOEXCEPT { return min(); }
  8764. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
  8765. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
  8766. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
  8767. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
  8768. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
  8769. static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
  8770. };
  8771. }
  8772. #endif
  8773. #ifndef VC_COMMON_BITSCANINTRINSICS_H_
  8774. #define VC_COMMON_BITSCANINTRINSICS_H_
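// Portable shims for _bit_scan_forward/_bit_scan_reverse: GCC 4.5+ takes them from
// <x86intrin.h>, otherwise __builtin_ctz and an inline bsr are used as fallbacks, and
// on Windows they wrap _BitScanForward/_BitScanReverse. The empty ICC branch is
// presumably because ICC already ships these intrinsics.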
  8775. #if defined(Vc_GCC) || defined(Vc_CLANG) || defined(Vc_APPLECLANG)
  8776. # if Vc_GCC >= 0x40500
  8777. # include <x86intrin.h>
  8778. # else
  8779. #define _bit_scan_forward(x) __builtin_ctz(x)
  8780. static Vc_ALWAYS_INLINE Vc_CONST int _Vc_bit_scan_reverse_asm(unsigned int x) {
  8781. int r;
  8782. __asm__("bsr %1,%0" : "=r"(r) : "X"(x));
  8783. return r;
  8784. }
  8785. #define _bit_scan_reverse(x) _Vc_bit_scan_reverse_asm(x)
  8786. # endif
  8787. #elif defined(_WIN32)
  8788. #include <intrin.h>
  8789. static inline __forceinline unsigned long _bit_scan_forward(unsigned long x) {
  8790. unsigned long index;
  8791. _BitScanForward(&index, x);
  8792. return index;
  8793. }
  8794. static inline __forceinline unsigned long _bit_scan_reverse(unsigned long x) {
  8795. unsigned long index;
  8796. _BitScanReverse(&index, x);
  8797. return index;
  8798. }
  8799. #elif defined(Vc_ICC)
  8800. #else
  8801. #endif
  8802. #endif
  8803. #ifndef VC_COMMON_SET_H_
  8804. #define VC_COMMON_SET_H_
  8805. namespace Vc_VERSIONED_NAMESPACE
  8806. {
  8807. namespace
  8808. {
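// set() builds a __m128i from eight 16-bit values by packing pairs into 32-bit words
// and inserting them either via inline asm (movd/pinsrd, optionally VEX encoded) or
// via _mm_setr_epi32 when GNU inline assembly is not available.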
  8809. static Vc_INTRINSIC Vc_CONST __m128i set(unsigned short x0, unsigned short x1, unsigned short x2, unsigned short x3,
  8810. unsigned short x4, unsigned short x5, unsigned short x6, unsigned short x7)
  8811. {
  8812. #if defined(Vc_GNU_ASM)
  8813. #if 0
  8814. __m128i r;
  8815. unsigned long long tmp0 = x3; tmp0 = (tmp0 << 16) | x2;
  8816. unsigned long long tmp1 = x1; tmp1 = (tmp1 << 16) | x0;
  8817. asm("vmovq %1,%0" : "=x"(r) : "r"((tmp0 << 32) | tmp1));
  8818. unsigned long long tmp2 = x7; tmp2 = (tmp2 << 16) | x6;
  8819. unsigned long long tmp3 = x5; tmp3 = (tmp3 << 16) | x4;
  8820. asm("vpinsrq $1,%1,%0,%0" : "+x"(r) : "r"((tmp2 << 32) | tmp3));
  8821. return r;
  8822. #elif defined(Vc_USE_VEX_CODING)
  8823. __m128i r0, r1;
  8824. unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
  8825. unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
  8826. unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
  8827. unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
  8828. asm("vmovd %1,%0" : "=x"(r0) : "r"(tmp0));
  8829. asm("vpinsrd $1,%1,%0,%0" : "+x"(r0) : "r"(tmp1));
  8830. asm("vmovd %1,%0" : "=x"(r1) : "r"(tmp2));
  8831. asm("vpinsrd $1,%1,%0,%0" : "+x"(r1) : "r"(tmp3));
  8832. asm("vpunpcklqdq %1,%0,%0" : "+x"(r0) : "x"(r1));
  8833. return r0;
  8834. #else
  8835. __m128i r0, r1;
  8836. unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
  8837. unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
  8838. unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
  8839. unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
  8840. asm("movd %1,%0" : "=x"(r0) : "r"(tmp0));
  8841. asm("pinsrd $1,%1,%0" : "+x"(r0) : "r"(tmp1));
  8842. asm("movd %1,%0" : "=x"(r1) : "r"(tmp2));
  8843. asm("pinsrd $1,%1,%0" : "+x"(r1) : "r"(tmp3));
  8844. asm("punpcklqdq %1,%0" : "+x"(r0) : "x"(r1));
  8845. return r0;
  8846. #endif
  8847. #else
  8848. unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
  8849. unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
  8850. unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
  8851. unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
  8852. return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
  8853. #endif
  8854. }
  8855. static Vc_INTRINSIC Vc_CONST __m128i set(short x0, short x1, short x2, short x3, short x4, short x5, short x6, short x7)
  8856. {
  8857. return set(static_cast<unsigned short>(x0), static_cast<unsigned short>(x1), static_cast<unsigned short>(x2),
  8858. static_cast<unsigned short>(x3), static_cast<unsigned short>(x4), static_cast<unsigned short>(x5),
  8859. static_cast<unsigned short>(x6), static_cast<unsigned short>(x7));
  8860. }
  8861. }
  8862. }
  8863. #endif
  8864. #ifndef VC_COMMON_GATHERIMPLEMENTATION_H_
  8865. #define VC_COMMON_GATHERIMPLEMENTATION_H_
  8866. namespace Vc_VERSIONED_NAMESPACE
  8867. {
  8868. namespace Common
  8869. {
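// Strategies for masked gathers/scatters: SimpleLoop iterates over all lanes,
// SetIndexZero forces masked-out indexes to zero and gathers unconditionally,
// BitScanLoop walks the set mask bits with bsf/bsr, and PopcntSwitch dispatches on
// the popcount of the mask.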
  8870. enum class GatherScatterImplementation : int {
  8871. SimpleLoop,
  8872. SetIndexZero,
  8873. BitScanLoop,
  8874. PopcntSwitch
  8875. };
  8876. using SimpleLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SimpleLoop>;
  8877. using SetIndexZeroT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SetIndexZero>;
  8878. using BitScanLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::BitScanLoop>;
  8879. using PopcntSwitchT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::PopcntSwitch>;
  8880. template <typename V, typename MT, typename IT>
  8881. Vc_ALWAYS_INLINE void executeGather(SetIndexZeroT,
  8882. V &v,
  8883. const MT *mem,
  8884. IT &&indexes_,
  8885. typename V::MaskArgument mask)
  8886. {
  8887. auto indexes = std::forward<IT>(indexes_);
  8888. indexes.setZeroInverted(static_cast<decltype(!indexes)>(mask));
  8889. const V tmp(mem, indexes);
  8890. where(mask) | v = tmp;
  8891. }
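// SimpleLoop: on GCC >= 4.9 the lanes are written through a gnu::vector_size builtin
// temporary, presumably so the compiler can keep the value in a vector register
// instead of emitting per-lane insert sequences; otherwise the lanes are assigned
// directly through operator[].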
  8892. template <typename V, typename MT, typename IT>
  8893. Vc_ALWAYS_INLINE void executeGather(SimpleLoopT, V &v, const MT *mem, const IT &indexes,
  8894. const typename V::MaskArgument mask)
  8895. {
  8896. if (Vc_IS_UNLIKELY(mask.isEmpty())) {
  8897. return;
  8898. }
  8899. #if defined Vc_GCC && Vc_GCC >= 0x40900
  8900. constexpr std::size_t Sizeof = sizeof(V);
  8901. using Builtin [[gnu::vector_size(Sizeof)]] = typename V::value_type;
  8902. Builtin tmp = reinterpret_cast<Builtin>(v.data());
  8903. Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
  8904. if (mask[i]) {
  8905. tmp[i] = mem[indexes[i]];
  8906. }
  8907. });
  8908. v.data() = reinterpret_cast<typename V::VectorType>(tmp);
  8909. #else
  8910. Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
  8911. if (mask[i])
  8912. v[i] = mem[indexes[i]];
  8913. });
  8914. #endif
  8915. }
  8916. template <typename V, typename MT, typename IT>
  8917. Vc_ALWAYS_INLINE void executeGather(BitScanLoopT,
  8918. V &v,
  8919. const MT *mem,
  8920. const IT &indexes,
  8921. typename V::MaskArgument mask)
  8922. {
  8923. #ifdef Vc_GNU_ASM
  8924. size_t bits = mask.toInt();
  8925. while (Vc_IS_LIKELY(bits > 0)) {
  8926. size_t i, j;
  8927. asm("bsf %[bits],%[i]\n\t"
  8928. "bsr %[bits],%[j]\n\t"
  8929. "btr %[i],%[bits]\n\t"
  8930. "btr %[j],%[bits]\n\t"
  8931. : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
  8932. v[i] = mem[indexes[i]];
  8933. v[j] = mem[indexes[j]];
  8934. }
  8935. #else
  8936. int bits = mask.toInt();
  8937. while (bits) {
  8938. const int i = _bit_scan_forward(bits);
  8939. bits &= bits - 1;
  8940. v[i] = mem[indexes[i]];
  8941. }
  8942. #endif
  8943. }
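// PopcntSwitch: the switch on the mask popcount deliberately falls through from the
// highest case down to case 0, alternating _bit_scan_forward/_bit_scan_reverse so the
// set bits are consumed from both ends until every selected lane has been loaded.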
  8944. template <typename V, typename MT, typename IT>
  8945. Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
  8946. V &v,
  8947. const MT *mem,
  8948. const IT &indexes,
  8949. typename V::MaskArgument mask,
  8950. enable_if<V::Size == 16> = nullarg)
  8951. {
  8952. unsigned int bits = mask.toInt();
  8953. unsigned int low, high = 0;
  8954. switch (Vc::Detail::popcnt16(bits)) {
  8955. case 16:
  8956. v.gather(mem, indexes);
  8957. break;
  8958. case 15:
  8959. low = _bit_scan_forward(bits);
  8960. bits ^= 1 << low;
  8961. v[low] = mem[indexes[low]];
  8962. case 14:
  8963. high = _bit_scan_reverse(bits);
  8964. v[high] = mem[indexes[high]];
  8965. high = (1 << high);
  8966. case 13:
  8967. low = _bit_scan_forward(bits);
  8968. bits ^= high | (1 << low);
  8969. v[low] = mem[indexes[low]];
  8970. case 12:
  8971. high = _bit_scan_reverse(bits);
  8972. v[high] = mem[indexes[high]];
  8973. high = (1 << high);
  8974. case 11:
  8975. low = _bit_scan_forward(bits);
  8976. bits ^= high | (1 << low);
  8977. v[low] = mem[indexes[low]];
  8978. case 10:
  8979. high = _bit_scan_reverse(bits);
  8980. v[high] = mem[indexes[high]];
  8981. high = (1 << high);
  8982. case 9:
  8983. low = _bit_scan_forward(bits);
  8984. bits ^= high | (1 << low);
  8985. v[low] = mem[indexes[low]];
  8986. case 8:
  8987. high = _bit_scan_reverse(bits);
  8988. v[high] = mem[indexes[high]];
  8989. high = (1 << high);
  8990. case 7:
  8991. low = _bit_scan_forward(bits);
  8992. bits ^= high | (1 << low);
  8993. v[low] = mem[indexes[low]];
  8994. case 6:
  8995. high = _bit_scan_reverse(bits);
  8996. v[high] = mem[indexes[high]];
  8997. high = (1 << high);
  8998. case 5:
  8999. low = _bit_scan_forward(bits);
  9000. bits ^= high | (1 << low);
  9001. v[low] = mem[indexes[low]];
  9002. case 4:
  9003. high = _bit_scan_reverse(bits);
  9004. v[high] = mem[indexes[high]];
  9005. high = (1 << high);
  9006. case 3:
  9007. low = _bit_scan_forward(bits);
  9008. bits ^= high | (1 << low);
  9009. v[low] = mem[indexes[low]];
  9010. case 2:
  9011. high = _bit_scan_reverse(bits);
  9012. v[high] = mem[indexes[high]];
  9013. case 1:
  9014. low = _bit_scan_forward(bits);
  9015. v[low] = mem[indexes[low]];
  9016. case 0:
  9017. break;
  9018. }
  9019. }
  9020. template <typename V, typename MT, typename IT>
  9021. Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
  9022. V &v,
  9023. const MT *mem,
  9024. const IT &indexes,
  9025. typename V::MaskArgument mask,
  9026. enable_if<V::Size == 8> = nullarg)
  9027. {
  9028. unsigned int bits = mask.toInt();
  9029. unsigned int low, high = 0;
  9030. switch (Vc::Detail::popcnt8(bits)) {
  9031. case 8:
  9032. v.gather(mem, indexes);
  9033. break;
  9034. case 7:
  9035. low = _bit_scan_forward(bits);
  9036. bits ^= 1 << low;
  9037. v[low] = mem[indexes[low]];
  9038. case 6:
  9039. high = _bit_scan_reverse(bits);
  9040. v[high] = mem[indexes[high]];
  9041. high = (1 << high);
  9042. case 5:
  9043. low = _bit_scan_forward(bits);
  9044. bits ^= high | (1 << low);
  9045. v[low] = mem[indexes[low]];
  9046. case 4:
  9047. high = _bit_scan_reverse(bits);
  9048. v[high] = mem[indexes[high]];
  9049. high = (1 << high);
  9050. case 3:
  9051. low = _bit_scan_forward(bits);
  9052. bits ^= high | (1 << low);
  9053. v[low] = mem[indexes[low]];
  9054. case 2:
  9055. high = _bit_scan_reverse(bits);
  9056. v[high] = mem[indexes[high]];
  9057. case 1:
  9058. low = _bit_scan_forward(bits);
  9059. v[low] = mem[indexes[low]];
  9060. case 0:
  9061. break;
  9062. }
  9063. }
  9064. template <typename V, typename MT, typename IT>
  9065. Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
  9066. V &v,
  9067. const MT *mem,
  9068. const IT &indexes,
  9069. typename V::MaskArgument mask,
  9070. enable_if<V::Size == 4> = nullarg)
  9071. {
  9072. unsigned int bits = mask.toInt();
  9073. unsigned int low, high = 0;
  9074. switch (Vc::Detail::popcnt4(bits)) {
  9075. case 4:
  9076. v.gather(mem, indexes);
  9077. break;
  9078. case 3:
  9079. low = _bit_scan_forward(bits);
  9080. bits ^= 1 << low;
  9081. v[low] = mem[indexes[low]];
  9082. case 2:
  9083. high = _bit_scan_reverse(bits);
  9084. v[high] = mem[indexes[high]];
  9085. case 1:
  9086. low = _bit_scan_forward(bits);
  9087. v[low] = mem[indexes[low]];
  9088. case 0:
  9089. break;
  9090. }
  9091. }
  9092. template <typename V, typename MT, typename IT>
  9093. Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
  9094. V &v,
  9095. const MT *mem,
  9096. const IT &indexes,
  9097. typename V::MaskArgument mask,
  9098. enable_if<V::Size == 2> = nullarg)
  9099. {
  9100. unsigned int bits = mask.toInt();
  9101. unsigned int low;
  9102. switch (Vc::Detail::popcnt4(bits)) {
  9103. case 2:
  9104. v.gather(mem, indexes);
  9105. break;
  9106. case 1:
  9107. low = _bit_scan_forward(bits);
  9108. v[low] = mem[indexes[low]];
  9109. case 0:
  9110. break;
  9111. }
  9112. }
  9113. }
  9114. }
  9115. #endif
  9116. #ifndef VC_COMMON_SCATTERIMPLEMENTATION_H_
  9117. #define VC_COMMON_SCATTERIMPLEMENTATION_H_
  9118. namespace Vc_VERSIONED_NAMESPACE
  9119. {
  9120. namespace Common
  9121. {
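// Scatter counterparts of the gather strategies above; the loop-based variants store
// each active lane to mem[indexes[i]] instead of loading from it.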
  9122. template <typename V, typename MT, typename IT>
  9123. Vc_ALWAYS_INLINE void executeScatter(SetIndexZeroT,
  9124. V &v,
  9125. MT *mem,
  9126. IT indexes,
  9127. typename V::MaskArgument mask)
  9128. {
  9129. indexes.setZeroInverted(static_cast<typename IT::Mask>(mask));
  9130. const V tmp(mem, indexes);
  9131. where(mask) | v = tmp;
  9132. }
  9133. template <typename V, typename MT, typename IT>
  9134. Vc_ALWAYS_INLINE void executeScatter(SimpleLoopT,
  9135. V &v,
  9136. MT *mem,
  9137. const IT &indexes,
  9138. typename V::MaskArgument mask)
  9139. {
  9140. if (Vc_IS_UNLIKELY(mask.isEmpty())) {
  9141. return;
  9142. }
  9143. Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
  9144. if (mask[i])
  9145. mem[indexes[i]] = v[i];
  9146. });
  9147. }
  9148. template <typename V, typename MT, typename IT>
  9149. Vc_ALWAYS_INLINE void executeScatter(BitScanLoopT,
  9150. V &v,
  9151. MT *mem,
  9152. const IT &indexes,
  9153. typename V::MaskArgument mask)
  9154. {
  9155. size_t bits = mask.toInt();
  9156. while (Vc_IS_LIKELY(bits > 0)) {
  9157. size_t i, j;
  9158. asm("bsf %[bits],%[i]\n\t"
  9159. "bsr %[bits],%[j]\n\t"
  9160. "btr %[i],%[bits]\n\t"
  9161. "btr %[j],%[bits]\n\t"
  9162. : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
  9163. mem[indexes[i]] = v[i];
  9164. mem[indexes[j]] = v[j];
  9165. }
  9166. }
  9167. template <typename V, typename MT, typename IT>
  9168. Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
  9169. V &v,
  9170. MT *mem,
  9171. const IT &indexes,
  9172. typename V::MaskArgument mask,
  9173. enable_if<V::Size == 16> = nullarg)
  9174. {
  9175. unsigned int bits = mask.toInt();
  9176. unsigned int low, high = 0;
  9177. switch (Vc::Detail::popcnt16(bits)) {
  9178. case 16:
  9179. v.scatter(mem, indexes);
  9180. break;
  9181. case 15:
  9182. low = _bit_scan_forward(bits);
  9183. bits ^= 1 << low;
  9184. mem[indexes[low]] = v[low];
  9185. case 14:
  9186. high = _bit_scan_reverse(bits);
  9187. mem[indexes[high]] = v[high];
  9188. high = (1 << high);
  9189. case 13:
  9190. low = _bit_scan_forward(bits);
  9191. bits ^= high | (1 << low);
  9192. mem[indexes[low]] = v[low];
  9193. case 12:
  9194. high = _bit_scan_reverse(bits);
  9195. mem[indexes[high]] = v[high];
  9196. high = (1 << high);
  9197. case 11:
  9198. low = _bit_scan_forward(bits);
  9199. bits ^= high | (1 << low);
  9200. mem[indexes[low]] = v[low];
  9201. case 10:
  9202. high = _bit_scan_reverse(bits);
  9203. mem[indexes[high]] = v[high];
  9204. high = (1 << high);
  9205. case 9:
  9206. low = _bit_scan_forward(bits);
  9207. bits ^= high | (1 << low);
  9208. mem[indexes[low]] = v[low];
  9209. case 8:
  9210. high = _bit_scan_reverse(bits);
  9211. mem[indexes[high]] = v[high];
  9212. high = (1 << high);
  9213. case 7:
  9214. low = _bit_scan_forward(bits);
  9215. bits ^= high | (1 << low);
  9216. mem[indexes[low]] = v[low];
  9217. case 6:
  9218. high = _bit_scan_reverse(bits);
  9219. mem[indexes[high]] = v[high];
  9220. high = (1 << high);
  9221. case 5:
  9222. low = _bit_scan_forward(bits);
  9223. bits ^= high | (1 << low);
  9224. mem[indexes[low]] = v[low];
  9225. case 4:
  9226. high = _bit_scan_reverse(bits);
  9227. mem[indexes[high]] = v[high];
  9228. high = (1 << high);
  9229. case 3:
  9230. low = _bit_scan_forward(bits);
  9231. bits ^= high | (1 << low);
  9232. mem[indexes[low]] = v[low];
  9233. case 2:
  9234. high = _bit_scan_reverse(bits);
  9235. mem[indexes[high]] = v[high];
  9236. case 1:
  9237. low = _bit_scan_forward(bits);
  9238. mem[indexes[low]] = v[low];
  9239. case 0:
  9240. break;
  9241. }
  9242. }
  9243. template <typename V, typename MT, typename IT>
  9244. Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
  9245. V &v,
  9246. MT *mem,
  9247. const IT &indexes,
  9248. typename V::MaskArgument mask,
  9249. enable_if<V::Size == 8> = nullarg)
  9250. {
  9251. unsigned int bits = mask.toInt();
  9252. unsigned int low, high = 0;
  9253. switch (Vc::Detail::popcnt8(bits)) {
  9254. case 8:
  9255. v.scatter(mem, indexes);
  9256. break;
  9257. case 7:
  9258. low = _bit_scan_forward(bits);
  9259. bits ^= 1 << low;
  9260. mem[indexes[low]] = v[low];
  9261. case 6:
  9262. high = _bit_scan_reverse(bits);
  9263. mem[indexes[high]] = v[high];
  9264. high = (1 << high);
  9265. case 5:
  9266. low = _bit_scan_forward(bits);
  9267. bits ^= high | (1 << low);
  9268. mem[indexes[low]] = v[low];
  9269. case 4:
  9270. high = _bit_scan_reverse(bits);
  9271. mem[indexes[high]] = v[high];
  9272. high = (1 << high);
  9273. case 3:
  9274. low = _bit_scan_forward(bits);
  9275. bits ^= high | (1 << low);
  9276. mem[indexes[low]] = v[low];
  9277. case 2:
  9278. high = _bit_scan_reverse(bits);
  9279. mem[indexes[high]] = v[high];
  9280. case 1:
  9281. low = _bit_scan_forward(bits);
  9282. mem[indexes[low]] = v[low];
  9283. case 0:
  9284. break;
  9285. }
  9286. }
  9287. template <typename V, typename MT, typename IT>
  9288. Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
  9289. V &v,
  9290. MT *mem,
  9291. const IT &indexes,
  9292. typename V::MaskArgument mask,
  9293. enable_if<V::Size == 4> = nullarg)
  9294. {
  9295. unsigned int bits = mask.toInt();
  9296. unsigned int low, high = 0;
  9297. switch (Vc::Detail::popcnt4(bits)) {
  9298. case 4:
  9299. v.scatter(mem, indexes);
  9300. break;
  9301. case 3:
  9302. low = _bit_scan_forward(bits);
  9303. bits ^= 1 << low;
  9304. mem[indexes[low]] = v[low];
  9305. case 2:
  9306. high = _bit_scan_reverse(bits);
  9307. mem[indexes[high]] = v[high];
  9308. case 1:
  9309. low = _bit_scan_forward(bits);
  9310. mem[indexes[low]] = v[low];
  9311. case 0:
  9312. break;
  9313. }
  9314. }
  9315. template <typename V, typename MT, typename IT>
  9316. Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
  9317. V &v,
  9318. MT *mem,
  9319. const IT &indexes,
  9320. typename V::MaskArgument mask,
  9321. enable_if<V::Size == 2> = nullarg)
  9322. {
  9323. unsigned int bits = mask.toInt();
  9324. unsigned int low;
  9325. switch (Vc::Detail::popcnt4(bits)) {
  9326. case 2:
  9327. v.scatter(mem, indexes);
  9328. break;
  9329. case 1:
  9330. low = _bit_scan_forward(bits);
  9331. mem[indexes[low]] = v[low];
  9332. case 0:
  9333. break;
  9334. }
  9335. }
  9336. }
  9337. }
  9338. #endif
  9339. namespace Vc_VERSIONED_NAMESPACE
  9340. {
  9341. namespace Detail
  9342. {
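// Element-wise comparison operators for the SSE vectors. Unsigned > and < go through
// cmpgt_epu32/cmplt_epu32 (and the epu16 variants), which compensate for the lack of
// unsigned integer compares in SSE; defining USE_INCORRECT_UNSIGNED_COMPARE falls
// back to the signed intrinsics.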
  9343. Vc_INTRINSIC SSE::double_m operator==(SSE::double_v a, SSE::double_v b) { return _mm_cmpeq_pd(a.data(), b.data()); }
  9344. Vc_INTRINSIC SSE:: float_m operator==(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpeq_ps(a.data(), b.data()); }
  9345. Vc_INTRINSIC SSE:: int_m operator==(SSE:: int_v a, SSE:: int_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
  9346. Vc_INTRINSIC SSE:: uint_m operator==(SSE:: uint_v a, SSE:: uint_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
  9347. Vc_INTRINSIC SSE:: short_m operator==(SSE:: short_v a, SSE:: short_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }
  9348. Vc_INTRINSIC SSE::ushort_m operator==(SSE::ushort_v a, SSE::ushort_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }
  9349. Vc_INTRINSIC SSE::double_m operator!=(SSE::double_v a, SSE::double_v b) { return _mm_cmpneq_pd(a.data(), b.data()); }
  9350. Vc_INTRINSIC SSE:: float_m operator!=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpneq_ps(a.data(), b.data()); }
  9351. Vc_INTRINSIC SSE:: int_m operator!=(SSE:: int_v a, SSE:: int_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
  9352. Vc_INTRINSIC SSE:: uint_m operator!=(SSE:: uint_v a, SSE:: uint_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
  9353. Vc_INTRINSIC SSE:: short_m operator!=(SSE:: short_v a, SSE:: short_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }
  9354. Vc_INTRINSIC SSE::ushort_m operator!=(SSE::ushort_v a, SSE::ushort_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }
  9355. Vc_INTRINSIC SSE::double_m operator> (SSE::double_v a, SSE::double_v b) { return _mm_cmpgt_pd(a.data(), b.data()); }
  9356. Vc_INTRINSIC SSE:: float_m operator> (SSE:: float_v a, SSE:: float_v b) { return _mm_cmpgt_ps(a.data(), b.data()); }
  9357. Vc_INTRINSIC SSE:: int_m operator> (SSE:: int_v a, SSE:: int_v b) { return _mm_cmpgt_epi32(a.data(), b.data()); }
  9358. Vc_INTRINSIC SSE:: uint_m operator> (SSE:: uint_v a, SSE:: uint_v b) {
  9359. #ifndef USE_INCORRECT_UNSIGNED_COMPARE
  9360. return SSE::cmpgt_epu32(a.data(), b.data());
  9361. #else
  9362. return _mm_cmpgt_epi32(a.data(), b.data());
  9363. #endif
  9364. }
  9365. Vc_INTRINSIC SSE:: short_m operator> (SSE:: short_v a, SSE:: short_v b) { return _mm_cmpgt_epi16(a.data(), b.data()); }
  9366. Vc_INTRINSIC SSE::ushort_m operator> (SSE::ushort_v a, SSE::ushort_v b) {
  9367. #ifndef USE_INCORRECT_UNSIGNED_COMPARE
  9368. return SSE::cmpgt_epu16(a.data(), b.data());
  9369. #else
  9370. return _mm_cmpgt_epi16(a.data(), b.data());
  9371. #endif
  9372. }
  9373. Vc_INTRINSIC SSE::double_m operator< (SSE::double_v a, SSE::double_v b) { return _mm_cmplt_pd(a.data(), b.data()); }
  9374. Vc_INTRINSIC SSE:: float_m operator< (SSE:: float_v a, SSE:: float_v b) { return _mm_cmplt_ps(a.data(), b.data()); }
  9375. Vc_INTRINSIC SSE:: int_m operator< (SSE:: int_v a, SSE:: int_v b) { return _mm_cmplt_epi32(a.data(), b.data()); }
  9376. Vc_INTRINSIC SSE:: uint_m operator< (SSE:: uint_v a, SSE:: uint_v b) {
  9377. #ifndef USE_INCORRECT_UNSIGNED_COMPARE
  9378. return SSE::cmplt_epu32(a.data(), b.data());
  9379. #else
  9380. return _mm_cmplt_epi32(a.data(), b.data());
  9381. #endif
  9382. }
  9383. Vc_INTRINSIC SSE:: short_m operator< (SSE:: short_v a, SSE:: short_v b) { return _mm_cmplt_epi16(a.data(), b.data()); }
  9384. Vc_INTRINSIC SSE::ushort_m operator< (SSE::ushort_v a, SSE::ushort_v b) {
  9385. #ifndef USE_INCORRECT_UNSIGNED_COMPARE
  9386. return SSE::cmplt_epu16(a.data(), b.data());
  9387. #else
  9388. return _mm_cmplt_epi16(a.data(), b.data());
  9389. #endif
  9390. }
  9391. Vc_INTRINSIC SSE::double_m operator>=(SSE::double_v a, SSE::double_v b) { return _mm_cmpnlt_pd(a.data(), b.data()); }
  9392. Vc_INTRINSIC SSE:: float_m operator>=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpnlt_ps(a.data(), b.data()); }
  9393. Vc_INTRINSIC SSE:: int_m operator>=(SSE:: int_v a, SSE:: int_v b) { return !(a < b); }
  9394. Vc_INTRINSIC SSE:: uint_m operator>=(SSE:: uint_v a, SSE:: uint_v b) { return !(a < b); }
  9395. Vc_INTRINSIC SSE:: short_m operator>=(SSE:: short_v a, SSE:: short_v b) { return !(a < b); }
  9396. Vc_INTRINSIC SSE::ushort_m operator>=(SSE::ushort_v a, SSE::ushort_v b) { return !(a < b); }
  9397. Vc_INTRINSIC SSE::double_m operator<=(SSE::double_v a, SSE::double_v b) { return _mm_cmple_pd(a.data(), b.data()); }
  9398. Vc_INTRINSIC SSE:: float_m operator<=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmple_ps(a.data(), b.data()); }
  9399. Vc_INTRINSIC SSE:: int_m operator<=(SSE:: int_v a, SSE:: int_v b) { return !(a > b); }
  9400. Vc_INTRINSIC SSE:: uint_m operator<=(SSE:: uint_v a, SSE:: uint_v b) { return !(a > b); }
  9401. Vc_INTRINSIC SSE:: short_m operator<=(SSE:: short_v a, SSE:: short_v b) { return !(a > b); }
  9402. Vc_INTRINSIC SSE::ushort_m operator<=(SSE::ushort_v a, SSE::ushort_v b) { return !(a > b); }
  9403. template <typename T>
  9404. Vc_INTRINSIC SSE::Vector<T> operator^(SSE::Vector<T> a, SSE::Vector<T> b)
  9405. {
  9406. return xor_(a.data(), b.data());
  9407. }
  9408. template <typename T>
  9409. Vc_INTRINSIC SSE::Vector<T> operator&(SSE::Vector<T> a, SSE::Vector<T> b)
  9410. {
  9411. return and_(a.data(), b.data());
  9412. }
  9413. template <typename T>
  9414. Vc_INTRINSIC SSE::Vector<T> operator|(SSE::Vector<T> a, SSE::Vector<T> b)
  9415. {
  9416. return or_(a.data(), b.data());
  9417. }
  9418. template <typename T>
  9419. Vc_INTRINSIC SSE::Vector<T> operator+(SSE::Vector<T> a, SSE::Vector<T> b)
  9420. {
  9421. return add(a.data(), b.data(), T());
  9422. }
  9423. template <typename T>
  9424. Vc_INTRINSIC SSE::Vector<T> operator-(SSE::Vector<T> a, SSE::Vector<T> b)
  9425. {
  9426. return sub(a.data(), b.data(), T());
  9427. }
  9428. template <typename T>
  9429. Vc_INTRINSIC SSE::Vector<T> operator*(SSE::Vector<T> a, SSE::Vector<T> b)
  9430. {
  9431. return mul(a.data(), b.data(), T());
  9432. }
  9433. template <typename T>
  9434. Vc_INTRINSIC enable_if<std::is_floating_point<T>::value, SSE::Vector<T>> operator/(
  9435. SSE::Vector<T> a, SSE::Vector<T> b)
  9436. {
  9437. return div(a.data(), b.data(), T());
  9438. }
  9439. template <typename T>
  9440. Vc_INTRINSIC
  9441. enable_if<std::is_same<int, T>::value || std::is_same<uint, T>::value, SSE::Vector<T>>
  9442. operator/(SSE::Vector<T> a, SSE::Vector<T> b)
  9443. {
  9444. return SSE::Vector<T>::generate([&](int i) { return a[i] / b[i]; });
  9445. }
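// 16-bit division has no SSE instruction: both operands are widened to two float
// vectors (expand0/expand1 + _mm_cvtepi32_ps), divided with _mm_div_ps, truncated
// back with _mm_cvttps_epi32, and re-packed with concat.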
  9446. template <typename T>
  9447. Vc_INTRINSIC enable_if<std::is_same<short, T>::value || std::is_same<ushort, T>::value,
  9448. SSE::Vector<T>>
  9449. operator/(SSE::Vector<T> a, SSE::Vector<T> b)
  9450. {
  9451. using HT = SSE::VectorHelper<T>;
  9452. __m128 lo = _mm_cvtepi32_ps(HT::expand0(a.data()));
  9453. __m128 hi = _mm_cvtepi32_ps(HT::expand1(a.data()));
  9454. lo = _mm_div_ps(lo, _mm_cvtepi32_ps(HT::expand0(b.data())));
  9455. hi = _mm_div_ps(hi, _mm_cvtepi32_ps(HT::expand1(b.data())));
  9456. return HT::concat(_mm_cvttps_epi32(lo), _mm_cvttps_epi32(hi));
  9457. }
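// operator% is derived from the division above as a - (a / b) * b.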
  9458. template <typename T>
  9459. Vc_INTRINSIC enable_if<std::is_integral<T>::value, SSE::Vector<T>> operator%(
  9460. SSE::Vector<T> a, SSE::Vector<T> b)
  9461. {
  9462. return a - a / b * b;
  9463. }
  9464. }
  9465. template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerZero)
  9466. : d(HV::zero())
  9467. {
  9468. }
  9469. template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerOne)
  9470. : d(HT::one())
  9471. {
  9472. }
  9473. template <typename T>
  9474. Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
  9475. : d(Detail::load16(Detail::IndexesFromZero<EntryType, Size>(), Aligned))
  9476. {
  9477. #if defined Vc_GCC && Vc_GCC < 0x40903 && defined Vc_IMPL_AVX2
  9478. if (std::is_same<T, short>::value) {
  9479. asm("" ::"x"(d.v()));
  9480. }
  9481. #endif
  9482. }
  9483. template <>
  9484. Vc_INTRINSIC Vector<float, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
  9485. : d(SSE::convert<int, float>(SSE::int_v::IndexesFromZero().data()))
  9486. {
  9487. }
  9488. template <>
  9489. Vc_INTRINSIC Vector<double, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
  9490. : d(SSE::convert<int, double>(SSE::int_v::IndexesFromZero().data()))
  9491. {
  9492. }
  9493. template <typename DstT>
  9494. template <typename SrcT, typename Flags>
  9495. Vc_INTRINSIC typename Vector<DstT, VectorAbi::Sse>::
  9496. #ifndef Vc_MSVC
  9497. template
  9498. #endif
  9499. load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Sse>::load(const SrcT *mem, Flags flags)
  9500. {
  9501. Common::handleLoadPrefetches(mem, flags);
  9502. d.v() = Detail::load<VectorType, DstT>(mem, flags);
  9503. }
  9504. template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero()
  9505. {
  9506. data() = HV::zero();
  9507. }
  9508. template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero(const Mask &k)
  9509. {
  9510. data() = Detail::andnot_(k.data(), data());
  9511. }
  9512. template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZeroInverted(const Mask &k)
  9513. {
  9514. data() = Detail::and_(k.data(), data());
  9515. }
  9516. template<> Vc_INTRINSIC void SSE::double_v::setQnan()
  9517. {
  9518. data() = SSE::_mm_setallone_pd();
  9519. }
  9520. template<> Vc_INTRINSIC void Vector<double, VectorAbi::Sse>::setQnan(const Mask &k)
  9521. {
  9522. data() = _mm_or_pd(data(), k.dataD());
  9523. }
  9524. template<> Vc_INTRINSIC void SSE::float_v::setQnan()
  9525. {
  9526. data() = SSE::_mm_setallone_ps();
  9527. }
  9528. template<> Vc_INTRINSIC void Vector<float, VectorAbi::Sse>::setQnan(const Mask &k)
  9529. {
  9530. data() = _mm_or_ps(data(), k.dataF());
  9531. }
  9532. template <typename T>
  9533. template <typename U, typename Flags, typename>
  9534. Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Flags flags) const
  9535. {
  9536. Common::handleStorePrefetches(mem, flags);
  9537. HV::template store<Flags>(mem, data());
  9538. }
  9539. template <typename T>
  9540. template <typename U, typename Flags, typename>
  9541. Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Mask mask, Flags flags) const
  9542. {
  9543. Common::handleStorePrefetches(mem, flags);
  9544. HV::template store<Flags>(mem, data(), mask.data());
  9545. }
  9546. template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator-() const
  9547. {
  9548. return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
  9549. }
  9550. #ifdef Vc_IMPL_XOP
  9551. template <> Vc_ALWAYS_INLINE SSE::int_v SSE::int_v::operator<<(const SSE::int_v shift) const { return _mm_sha_epi32(d.v(), shift.d.v()); }
  9552. template <> Vc_ALWAYS_INLINE SSE::uint_v SSE::uint_v::operator<<(const SSE::uint_v shift) const { return _mm_shl_epi32(d.v(), shift.d.v()); }
  9553. template <> Vc_ALWAYS_INLINE SSE::short_v SSE::short_v::operator<<(const SSE::short_v shift) const { return _mm_sha_epi16(d.v(), shift.d.v()); }
  9554. template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator<<(const SSE::ushort_v shift) const { return _mm_shl_epi16(d.v(), shift.d.v()); }
  9555. template <> Vc_ALWAYS_INLINE SSE::int_v SSE::int_v::operator>>(const SSE::int_v shift) const { return operator<<(-shift); }
  9556. template <> Vc_ALWAYS_INLINE SSE::uint_v SSE::uint_v::operator>>(const SSE::uint_v shift) const { return operator<<(-shift); }
  9557. template <> Vc_ALWAYS_INLINE SSE::short_v SSE::short_v::operator>>(const SSE::short_v shift) const { return operator<<(-shift); }
  9558. template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator>>(const SSE::ushort_v shift) const { return operator<<(-shift); }
  9559. #elif defined Vc_IMPL_AVX2
  9560. template <> Vc_ALWAYS_INLINE SSE::Vector< int> Vector< int, VectorAbi::Sse>::operator<<(const SSE::Vector< int> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
  9561. template <> Vc_ALWAYS_INLINE SSE::Vector< uint> Vector< uint, VectorAbi::Sse>::operator<<(const SSE::Vector< uint> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
  9562. template <> Vc_ALWAYS_INLINE SSE::Vector< int> Vector< int, VectorAbi::Sse>::operator>>(const SSE::Vector< int> x) const { return _mm_srav_epi32(d.v(), x.d.v()); }
  9563. template <> Vc_ALWAYS_INLINE SSE::Vector< uint> Vector< uint, VectorAbi::Sse>::operator>>(const SSE::Vector< uint> x) const { return _mm_srlv_epi32(d.v(), x.d.v()); }
  9564. #endif
  9565. template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator>>=(int shift) {
  9566. d.v() = HT::shiftRight(d.v(), shift);
  9567. return *this;
  9568. }
  9569. template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator>>(int shift) const {
  9570. return HT::shiftRight(d.v(), shift);
  9571. }
  9572. template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator<<=(int shift) {
  9573. d.v() = HT::shiftLeft(d.v(), shift);
  9574. return *this;
  9575. }
  9576. template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator<<(int shift) const {
  9577. return HT::shiftLeft(d.v(), shift);
  9578. }
  9579. Vc_INTRINSIC Vc_CONST SSE::float_m isnegative(SSE::float_v x)
  9580. {
  9581. return sse_cast<__m128>(_mm_srai_epi32(
  9582. sse_cast<__m128i>(_mm_and_ps(SSE::_mm_setsignmask_ps(), x.data())), 31));
  9583. }
  9584. Vc_INTRINSIC Vc_CONST SSE::double_m isnegative(SSE::double_v x)
  9585. {
  9586. return Mem::permute<X1, X1, X3, X3>(sse_cast<__m128>(_mm_srai_epi32(
  9587. sse_cast<__m128i>(_mm_and_pd(SSE::_mm_setsignmask_pd(), x.data())), 31)));
  9588. }
  9589. #define Vc_GATHER_IMPL(V_) \
  9590. template <> \
  9591. template <class MT, class IT, int Scale> \
  9592. inline void SSE::V_::gatherImplementation( \
  9593. const Common::GatherArguments<MT, IT, Scale> &args)
  9594. #define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
  9595. Vc_GATHER_IMPL(double_v) { d.v() = _mm_setr_pd(Vc_M(0), Vc_M(1)); }
  9596. Vc_GATHER_IMPL(float_v) { d.v() = _mm_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
  9597. Vc_GATHER_IMPL(int_v) { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
  9598. Vc_GATHER_IMPL(uint_v) { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
  9599. Vc_GATHER_IMPL(short_v)
  9600. {
  9601. d.v() =
  9602. Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7));
  9603. }
  9604. Vc_GATHER_IMPL(ushort_v)
  9605. {
  9606. d.v() =
  9607. Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7));
  9608. }
  9609. #undef Vc_M
  9610. #undef Vc_GATHER_IMPL
  9611. template <typename T>
  9612. template <class MT, class IT, int Scale>
  9613. inline void Vector<T, VectorAbi::Sse>::gatherImplementation(
  9614. const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
  9615. {
  9616. const auto *mem = args.address;
  9617. const auto indexes = Scale * args.indexes;
  9618. using Selector = std::integral_constant < Common::GatherScatterImplementation,
  9619. #ifdef Vc_USE_SET_GATHERS
  9620. Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
  9621. #endif
  9622. #ifdef Vc_USE_BSF_GATHERS
  9623. Common::GatherScatterImplementation::BitScanLoop
  9624. #elif defined Vc_USE_POPCNT_BSF_GATHERS
  9625. Common::GatherScatterImplementation::PopcntSwitch
  9626. #else
  9627. Common::GatherScatterImplementation::SimpleLoop
  9628. #endif
  9629. > ;
  9630. Common::executeGather(Selector(), *this, mem, indexes, mask);
  9631. }
  9632. template <typename T>
  9633. template <typename MT, typename IT>
  9634. inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes) const
  9635. {
  9636. Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
  9637. }
  9638. template <typename T>
  9639. template <typename MT, typename IT>
  9640. inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
  9641. {
  9642. using Selector = std::integral_constant < Common::GatherScatterImplementation,
  9643. #ifdef Vc_USE_SET_GATHERS
  9644. Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
  9645. #endif
  9646. #ifdef Vc_USE_BSF_GATHERS
  9647. Common::GatherScatterImplementation::BitScanLoop
  9648. #elif defined Vc_USE_POPCNT_BSF_GATHERS
  9649. Common::GatherScatterImplementation::PopcntSwitch
  9650. #else
  9651. Common::GatherScatterImplementation::SimpleLoop
  9652. #endif
  9653. > ;
  9654. Common::executeScatter(Selector(), *this, mem, indexes, mask);
  9655. }
  9656. template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::partialSum() const
  9657. {
  9658. Vector<T, VectorAbi::Sse> tmp = *this;
  9659. if (Size > 1) tmp += tmp.shifted(-1);
  9660. if (Size > 2) tmp += tmp.shifted(-2);
  9661. if (Size > 4) tmp += tmp.shifted(-4);
  9662. if (Size > 8) tmp += tmp.shifted(-8);
  9663. if (Size > 16) tmp += tmp.shifted(-16);
  9664. return tmp;
  9665. }
  9666. #ifndef Vc_IMPL_SSE4_1
  9667. template<> Vc_INTRINSIC Vc_PURE int SSE::int_v::product() const
  9668. {
  9669. return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
  9670. }
  9671. template<> Vc_INTRINSIC Vc_PURE unsigned int SSE::uint_v::product() const
  9672. {
  9673. return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
  9674. }
  9675. #endif
  9676. template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::min(MaskArg m) const
  9677. {
  9678. Vector<T, VectorAbi::Sse> tmp = std::numeric_limits<Vector<T, VectorAbi::Sse> >::max();
  9679. tmp(m) = *this;
  9680. return tmp.min();
  9681. }
  9682. template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::max(MaskArg m) const
  9683. {
  9684. Vector<T, VectorAbi::Sse> tmp = std::numeric_limits<Vector<T, VectorAbi::Sse> >::min();
  9685. tmp(m) = *this;
  9686. return tmp.max();
  9687. }
  9688. template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::product(MaskArg m) const
  9689. {
  9690. Vector<T, VectorAbi::Sse> tmp(Vc::One);
  9691. tmp(m) = *this;
  9692. return tmp.product();
  9693. }
  9694. template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::sum(MaskArg m) const
  9695. {
  9696. Vector<T, VectorAbi::Sse> tmp(Vc::Zero);
  9697. tmp(m) = *this;
  9698. return tmp.sum();
  9699. }
  9700. namespace Detail
  9701. {
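// exponent() extracts the biased exponent field (shift by 23 for float, 52 for
// double), subtracts the bias (0x7f / 0x3ff) and converts the result back to the
// floating-point type; the public wrappers below assert non-negative input.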
  9702. Vc_INTRINSIC Vc_CONST __m128 exponent(__m128 v)
  9703. {
  9704. __m128i tmp = _mm_srli_epi32(_mm_castps_si128(v), 23);
  9705. tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x7f));
  9706. return _mm_cvtepi32_ps(tmp);
  9707. }
  9708. Vc_INTRINSIC Vc_CONST __m128d exponent(__m128d v)
  9709. {
  9710. __m128i tmp = _mm_srli_epi64(_mm_castpd_si128(v), 52);
  9711. tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x3ff));
  9712. return _mm_cvtepi32_pd(_mm_shuffle_epi32(tmp, 0x08));
  9713. }
  9714. }
  9715. Vc_INTRINSIC Vc_CONST SSE::float_v exponent(SSE::float_v x)
  9716. {
  9717. using Detail::operator>=;
  9718. Vc_ASSERT((x >= x.Zero()).isFull());
  9719. return Detail::exponent(x.data());
  9720. }
  9721. Vc_INTRINSIC Vc_CONST SSE::double_v exponent(SSE::double_v x)
  9722. {
  9723. using Detail::operator>=;
  9724. Vc_ASSERT((x >= x.Zero()).isFull());
  9725. return Detail::exponent(x.data());
  9726. }
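// The Random() implementations advance two interleaved linear congruential generators
// kept in Common::RandomState, using multiplier 0xdeece66d and increment 11 (the low
// 32 bits of the classic drand48 constants). float_v::Random() maps the state bits
// into [1, 2) by OR-ing in the exponent of 1.0f and subtracts 1 to land in [0, 1).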
  9727. static void _doRandomStep(SSE::uint_v &state0,
  9728. SSE::uint_v &state1)
  9729. {
  9730. using SSE::uint_v;
  9731. using Detail::operator+;
  9732. using Detail::operator*;
  9733. state0.load(&Common::RandomState[0]);
  9734. state1.load(&Common::RandomState[uint_v::Size]);
  9735. (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
  9736. uint_v(_mm_xor_si128((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
  9737. _mm_srli_epi32(state1.data(), 16)))
  9738. .store(&Common::RandomState[0]);
  9739. }
  9740. template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::Random()
  9741. {
  9742. SSE::uint_v state0, state1;
  9743. _doRandomStep(state0, state1);
  9744. return state0.data();
  9745. }
  9746. template<> Vc_ALWAYS_INLINE SSE::float_v SSE::float_v::Random()
  9747. {
  9748. SSE::uint_v state0, state1;
  9749. _doRandomStep(state0, state1);
  9750. return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
  9751. }
  9752. template<> Vc_ALWAYS_INLINE SSE::double_v SSE::double_v::Random()
  9753. {
  9754. typedef unsigned long long uint64 Vc_MAY_ALIAS;
  9755. uint64 state0 = *reinterpret_cast<const uint64 *>(&Common::RandomState[8]);
  9756. uint64 state1 = *reinterpret_cast<const uint64 *>(&Common::RandomState[10]);
  9757. const __m128i state = _mm_load_si128(reinterpret_cast<const __m128i *>(&Common::RandomState[8]));
  9758. *reinterpret_cast<uint64 *>(&Common::RandomState[ 8]) = (state0 * 0x5deece66dull + 11);
  9759. *reinterpret_cast<uint64 *>(&Common::RandomState[10]) = (state1 * 0x5deece66dull + 11);
  9760. return _mm_sub_pd(_mm_or_pd(_mm_castsi128_pd(_mm_srli_epi64(state, 12)), HT::one()), HT::one());
  9761. }
  9762. template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount) const
  9763. {
  9764. enum {
  9765. EntryTypeSizeof = sizeof(EntryType)
  9766. };
  9767. switch (amount) {
  9768. case 0: return *this;
  9769. case 1: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
  9770. case 2: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
  9771. case 3: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
  9772. case 4: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
  9773. case 5: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
  9774. case 6: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
  9775. case 7: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
  9776. case 8: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
  9777. case -1: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
  9778. case -2: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
  9779. case -3: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
  9780. case -4: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
  9781. case -5: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
  9782. case -6: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
  9783. case -7: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
  9784. case -8: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
  9785. }
  9786. return Zero();
  9787. }
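// shifted(amount, shiftIn) implements a funnel shift: the two registers are
// concatenated via palignr (SSE::alignr_epi8), so lanes shifted out of *this are
// replaced by lanes shifted in from shiftIn.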
  9788. template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount, Vector shiftIn) const
  9789. {
  9790. if (amount >= -int(size())) {
  9791. constexpr int VectorWidth = int(size());
  9792. constexpr int EntryTypeSizeof = sizeof(EntryType);
  9793. const __m128i v0 = sse_cast<__m128i>(d.v());
  9794. const __m128i v1 = sse_cast<__m128i>(shiftIn.d.v());
  9795. auto &&fixup = sse_cast<VectorType, __m128i>;
  9796. switch (amount) {
  9797. case 0: return *this;
  9798. case -1: return fixup(SSE::alignr_epi8<(VectorWidth - 1) * EntryTypeSizeof>(v0, v1));
  9799. case -2: return fixup(SSE::alignr_epi8<(VectorWidth - 2) * EntryTypeSizeof>(v0, v1));
  9800. case -3: return fixup(SSE::alignr_epi8<(VectorWidth - 3) * EntryTypeSizeof>(v0, v1));
  9801. case -4: return fixup(SSE::alignr_epi8<(VectorWidth - 4) * EntryTypeSizeof>(v0, v1));
  9802. case -5: return fixup(SSE::alignr_epi8<(VectorWidth - 5) * EntryTypeSizeof>(v0, v1));
  9803. case -6: return fixup(SSE::alignr_epi8<(VectorWidth - 6) * EntryTypeSizeof>(v0, v1));
  9804. case -7: return fixup(SSE::alignr_epi8<(VectorWidth - 7) * EntryTypeSizeof>(v0, v1));
  9805. case -8: return fixup(SSE::alignr_epi8<(VectorWidth - 8) * EntryTypeSizeof>(v0, v1));
  9806. case -9: return fixup(SSE::alignr_epi8<(VectorWidth - 9) * EntryTypeSizeof>(v0, v1));
9807. case -10: return fixup(SSE::alignr_epi8<(VectorWidth -10) * EntryTypeSizeof>(v0, v1));
9808. case -11: return fixup(SSE::alignr_epi8<(VectorWidth -11) * EntryTypeSizeof>(v0, v1));
9809. case -12: return fixup(SSE::alignr_epi8<(VectorWidth -12) * EntryTypeSizeof>(v0, v1));
9810. case -13: return fixup(SSE::alignr_epi8<(VectorWidth -13) * EntryTypeSizeof>(v0, v1));
9811. case -14: return fixup(SSE::alignr_epi8<(VectorWidth -14) * EntryTypeSizeof>(v0, v1));
9812. case -15: return fixup(SSE::alignr_epi8<(VectorWidth -15) * EntryTypeSizeof>(v0, v1));
  9813. case 1: return fixup(SSE::alignr_epi8< 1 * EntryTypeSizeof>(v1, v0));
  9814. case 2: return fixup(SSE::alignr_epi8< 2 * EntryTypeSizeof>(v1, v0));
  9815. case 3: return fixup(SSE::alignr_epi8< 3 * EntryTypeSizeof>(v1, v0));
  9816. case 4: return fixup(SSE::alignr_epi8< 4 * EntryTypeSizeof>(v1, v0));
  9817. case 5: return fixup(SSE::alignr_epi8< 5 * EntryTypeSizeof>(v1, v0));
  9818. case 6: return fixup(SSE::alignr_epi8< 6 * EntryTypeSizeof>(v1, v0));
  9819. case 7: return fixup(SSE::alignr_epi8< 7 * EntryTypeSizeof>(v1, v0));
  9820. case 8: return fixup(SSE::alignr_epi8< 8 * EntryTypeSizeof>(v1, v0));
  9821. case 9: return fixup(SSE::alignr_epi8< 9 * EntryTypeSizeof>(v1, v0));
  9822. case 10: return fixup(SSE::alignr_epi8<10 * EntryTypeSizeof>(v1, v0));
  9823. case 11: return fixup(SSE::alignr_epi8<11 * EntryTypeSizeof>(v1, v0));
  9824. case 12: return fixup(SSE::alignr_epi8<12 * EntryTypeSizeof>(v1, v0));
  9825. case 13: return fixup(SSE::alignr_epi8<13 * EntryTypeSizeof>(v1, v0));
  9826. case 14: return fixup(SSE::alignr_epi8<14 * EntryTypeSizeof>(v1, v0));
  9827. case 15: return fixup(SSE::alignr_epi8<15 * EntryTypeSizeof>(v1, v0));
  9828. }
  9829. }
  9830. return shiftIn.shifted(int(size()) + amount);
  9831. }
  9832. template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::rotated(int amount) const
  9833. {
  9834. enum {
  9835. EntryTypeSizeof = sizeof(EntryType)
  9836. };
  9837. const __m128i v = SSE::sse_cast<__m128i>(d.v());
  9838. switch (static_cast<unsigned int>(amount) % Size) {
  9839. case 0: return *this;
  9840. case 1: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<1 * EntryTypeSizeof>(v, v));
  9841. case 2: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<2 * EntryTypeSizeof>(v, v));
  9842. case 3: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<3 * EntryTypeSizeof>(v, v));
  9843. case 4: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<4 * EntryTypeSizeof>(v, v));
  9844. case 5: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<5 * EntryTypeSizeof>(v, v));
  9845. case 6: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<6 * EntryTypeSizeof>(v, v));
  9846. case 7: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<7 * EntryTypeSizeof>(v, v));
  9847. }
  9848. return Zero();
  9849. }
  9850. namespace Detail
  9851. {
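// sorted() for double_v: swap the two lanes and recombine them with min/max so the
// smaller element ends up in lane 0.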
  9852. inline Vc_CONST SSE::double_v sorted(SSE::double_v x_)
  9853. {
  9854. const __m128d x = x_.data();
  9855. const __m128d y = _mm_shuffle_pd(x, x, _MM_SHUFFLE2(0, 1));
  9856. return _mm_unpacklo_pd(_mm_min_sd(x, y), _mm_max_sd(x, y));
  9857. }
  9858. }
  9859. template <typename T>
  9860. Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::sorted()
  9861. const
  9862. {
  9863. return Detail::sorted(*this);
  9864. }
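// interleaveLow()/interleaveHigh(): element-wise interleave of *this with x, mapped
// directly onto the SSE unpacklo/unpackhi intrinsics for each element width.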
  9865. template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveLow (SSE::double_v x) const { return _mm_unpacklo_pd(data(), x.data()); }
  9866. template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveHigh(SSE::double_v x) const { return _mm_unpackhi_pd(data(), x.data()); }
  9867. template <> Vc_INTRINSIC SSE::float_v SSE::float_v::interleaveLow ( SSE::float_v x) const { return _mm_unpacklo_ps(data(), x.data()); }
  9868. template <> Vc_INTRINSIC SSE::float_v SSE::float_v::interleaveHigh( SSE::float_v x) const { return _mm_unpackhi_ps(data(), x.data()); }
  9869. template <> Vc_INTRINSIC SSE::int_v SSE::int_v::interleaveLow ( SSE::int_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
  9870. template <> Vc_INTRINSIC SSE::int_v SSE::int_v::interleaveHigh( SSE::int_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
  9871. template <> Vc_INTRINSIC SSE::uint_v SSE::uint_v::interleaveLow ( SSE::uint_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
  9872. template <> Vc_INTRINSIC SSE::uint_v SSE::uint_v::interleaveHigh( SSE::uint_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
  9873. template <> Vc_INTRINSIC SSE::short_v SSE::short_v::interleaveLow ( SSE::short_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
  9874. template <> Vc_INTRINSIC SSE::short_v SSE::short_v::interleaveHigh( SSE::short_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
  9875. template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveLow (SSE::ushort_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
  9876. template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveHigh(SSE::ushort_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
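// generate(gen): build a vector by evaluating gen(0) .. gen(Size - 1) and packing the
// results with the matching _mm_setr_* intrinsic. The generator is invoked once per lane
// before the vector is assembled.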
  9877. template <> template <typename G> Vc_INTRINSIC SSE::double_v SSE::double_v::generate(G gen)
  9878. {
  9879. const auto tmp0 = gen(0);
  9880. const auto tmp1 = gen(1);
  9881. return _mm_setr_pd(tmp0, tmp1);
  9882. }
  9883. template <> template <typename G> Vc_INTRINSIC SSE::float_v SSE::float_v::generate(G gen)
  9884. {
  9885. const auto tmp0 = gen(0);
  9886. const auto tmp1 = gen(1);
  9887. const auto tmp2 = gen(2);
  9888. const auto tmp3 = gen(3);
  9889. return _mm_setr_ps(tmp0, tmp1, tmp2, tmp3);
  9890. }
  9891. template <> template <typename G> Vc_INTRINSIC SSE::int_v SSE::int_v::generate(G gen)
  9892. {
  9893. const auto tmp0 = gen(0);
  9894. const auto tmp1 = gen(1);
  9895. const auto tmp2 = gen(2);
  9896. const auto tmp3 = gen(3);
  9897. return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
  9898. }
  9899. template <> template <typename G> Vc_INTRINSIC SSE::uint_v SSE::uint_v::generate(G gen)
  9900. {
  9901. const auto tmp0 = gen(0);
  9902. const auto tmp1 = gen(1);
  9903. const auto tmp2 = gen(2);
  9904. const auto tmp3 = gen(3);
  9905. return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
  9906. }
  9907. template <> template <typename G> Vc_INTRINSIC SSE::short_v SSE::short_v::generate(G gen)
  9908. {
  9909. const auto tmp0 = gen(0);
  9910. const auto tmp1 = gen(1);
  9911. const auto tmp2 = gen(2);
  9912. const auto tmp3 = gen(3);
  9913. const auto tmp4 = gen(4);
  9914. const auto tmp5 = gen(5);
  9915. const auto tmp6 = gen(6);
  9916. const auto tmp7 = gen(7);
  9917. return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  9918. }
  9919. template <> template <typename G> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::generate(G gen)
  9920. {
  9921. const auto tmp0 = gen(0);
  9922. const auto tmp1 = gen(1);
  9923. const auto tmp2 = gen(2);
  9924. const auto tmp3 = gen(3);
  9925. const auto tmp4 = gen(4);
  9926. const auto tmp5 = gen(5);
  9927. const auto tmp6 = gen(6);
  9928. const auto tmp7 = gen(7);
  9929. return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  9930. }
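// reversed(): return the elements in reverse order. For the 2- and 4-element vectors a
// single permute suffices; the 8-element (u)short variants reverse each 64-bit half with
// permuteLo/permuteHi and then swap the halves.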
  9931. template <> Vc_INTRINSIC Vc_PURE SSE::double_v SSE::double_v::reversed() const
  9932. {
  9933. return Mem::permute<X1, X0>(d.v());
  9934. }
  9935. template <> Vc_INTRINSIC Vc_PURE SSE::float_v SSE::float_v::reversed() const
  9936. {
  9937. return Mem::permute<X3, X2, X1, X0>(d.v());
  9938. }
  9939. template <> Vc_INTRINSIC Vc_PURE SSE::int_v SSE::int_v::reversed() const
  9940. {
  9941. return Mem::permute<X3, X2, X1, X0>(d.v());
  9942. }
  9943. template <> Vc_INTRINSIC Vc_PURE SSE::uint_v SSE::uint_v::reversed() const
  9944. {
  9945. return Mem::permute<X3, X2, X1, X0>(d.v());
  9946. }
  9947. template <> Vc_INTRINSIC Vc_PURE SSE::short_v SSE::short_v::reversed() const
  9948. {
  9949. return sse_cast<__m128i>(
  9950. Mem::shuffle<X1, Y0>(sse_cast<__m128d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
  9951. sse_cast<__m128d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()))));
  9952. }
  9953. template <> Vc_INTRINSIC Vc_PURE SSE::ushort_v SSE::ushort_v::reversed() const
  9954. {
  9955. return sse_cast<__m128i>(
  9956. Mem::shuffle<X1, Y0>(sse_cast<__m128d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
  9957. sse_cast<__m128d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()))));
  9958. }
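// float_v::operator[](int_v): permute the vector by a runtime index vector using
// vpermilps when AVX is available; without AVX there is no variable permute for __m128,
// so the fallback returns the vector unchanged.
// broadcast<Index>(): replicate one compile-time selected lane into all lanes.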
  9959. template <>
  9960. Vc_INTRINSIC SSE::float_v SSE::float_v::operator[](const SSE::int_v &
  9961. #ifdef Vc_IMPL_AVX
  9962. perm
  9963. #endif
  9964. ) const
  9965. {
  9966. #ifdef Vc_IMPL_AVX
  9967. return _mm_permutevar_ps(d.v(), perm.data());
  9968. #else
  9969. return *this;
  9970. #endif
  9971. }
  9972. template <> template <int Index> Vc_INTRINSIC SSE::float_v SSE::float_v::broadcast() const
  9973. {
  9974. constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
  9975. return Mem::permute<Inner, Inner, Inner, Inner>(d.v());
  9976. }
  9977. template <> template <int Index> Vc_INTRINSIC SSE::double_v SSE::double_v::broadcast() const
  9978. {
  9979. constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
  9980. return Mem::permute<Inner, Inner>(d.v());
  9981. }
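// transpose_impl: 4x4 float matrix transpose. Two unpacklo/unpackhi passes over the four
// input rows produce the four transposed rows, which are written through the result
// pointers r[0..3].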
  9982. namespace Common
  9983. {
  9984. Vc_ALWAYS_INLINE void transpose_impl(
  9985. TransposeTag<4, 4>, SSE::float_v *Vc_RESTRICT r[],
  9986. const TransposeProxy<SSE::float_v, SSE::float_v, SSE::float_v, SSE::float_v> &proxy)
  9987. {
  9988. const auto in0 = std::get<0>(proxy.in).data();
  9989. const auto in1 = std::get<1>(proxy.in).data();
  9990. const auto in2 = std::get<2>(proxy.in).data();
  9991. const auto in3 = std::get<3>(proxy.in).data();
  9992. const auto tmp0 = _mm_unpacklo_ps(in0, in2);
  9993. const auto tmp1 = _mm_unpacklo_ps(in1, in3);
  9994. const auto tmp2 = _mm_unpackhi_ps(in0, in2);
  9995. const auto tmp3 = _mm_unpackhi_ps(in1, in3);
  9996. *r[0] = _mm_unpacklo_ps(tmp0, tmp1);
  9997. *r[1] = _mm_unpackhi_ps(tmp0, tmp1);
  9998. *r[2] = _mm_unpacklo_ps(tmp2, tmp3);
  9999. *r[3] = _mm_unpackhi_ps(tmp2, tmp3);
  10000. }
  10001. }
  10002. }
  10003. #ifndef VC_SSE_SIMD_CAST_H_
  10004. #define VC_SSE_SIMD_CAST_H_
  10005. #ifdef Vc_IMPL_AVX
  10006. #ifndef VC_AVX_CASTS_H_
  10007. #define VC_AVX_CASTS_H_
  10008. #ifndef VC_AVX_SHUFFLE_H_
  10009. #define VC_AVX_SHUFFLE_H_
  10010. namespace Vc_VERSIONED_NAMESPACE
  10011. {
  10012. namespace Detail
  10013. {
  10014. template <int... Dst> struct Permutation {};
  10015. template <uint8_t... Sel> struct Mask {};
  10016. #ifdef Vc_IMPL_AVX2
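// blend(a, b, Mask<Sel...>): 16-bit blend for AVX2. Each selector picks the word from a
// (0) or b (1); the selectors are folded into the 8-bit immediate of _mm256_blend_epi16,
// which applies the same mask to both 128-bit lanes, so the upper eight selectors are
// expected to mirror the lower eight.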
  10017. template <uint8_t Sel0, uint8_t Sel1, uint8_t Sel2, uint8_t Sel3, uint8_t Sel4,
  10018. uint8_t Sel5, uint8_t Sel6, uint8_t Sel7, uint8_t Sel8, uint8_t Sel9,
  10019. uint8_t Sel10, uint8_t Sel11, uint8_t Sel12, uint8_t Sel13, uint8_t Sel14,
  10020. uint8_t Sel15>
  10021. Vc_INTRINSIC Vc_CONST __m256i
  10022. blend(__m256i a, __m256i b, Mask<Sel0, Sel1, Sel2, Sel3, Sel4, Sel5, Sel6, Sel7, Sel8,
  10023. Sel9, Sel10, Sel11, Sel12, Sel13, Sel14, Sel15>)
  10024. {
  10025. static_assert((Sel0 == 0 || Sel0 == 1) && (Sel1 == 0 || Sel1 == 1) &&
  10026. (Sel2 == 0 || Sel2 == 1) && (Sel3 == 0 || Sel3 == 1) &&
  10027. (Sel4 == 0 || Sel4 == 1) && (Sel5 == 0 || Sel5 == 1) &&
  10028. (Sel6 == 0 || Sel6 == 1) && (Sel7 == 0 || Sel7 == 1) &&
  10029. (Sel8 == 0 || Sel8 == 1) && (Sel9 == 0 || Sel9 == 1) &&
  10030. (Sel10 == 0 || Sel10 == 1) && (Sel11 == 0 || Sel11 == 1) &&
  10031. (Sel12 == 0 || Sel12 == 1) && (Sel13 == 0 || Sel13 == 1) &&
  10032. (Sel14 == 0 || Sel14 == 1) && (Sel15 == 0 || Sel15 == 1),
  10033. "Selectors must be 0 or 1 to select the value from a or b");
  10034. constexpr uint8_t mask = static_cast<uint8_t>(
  10035. (Sel0 << 0 ) | (Sel1 << 1 ) | (Sel2 << 2 ) | (Sel3 << 3 ) |
  10036. (Sel4 << 4 ) | (Sel5 << 5 ) | (Sel6 << 6 ) | (Sel7 << 7 ) |
  10037. (Sel8 << 8 ) | (Sel9 << 9 ) | (Sel10 << 10) | (Sel11 << 11) |
  10038. (Sel12 << 12) | (Sel13 << 13) | (Sel14 << 14) | (Sel15 << 15));
  10039. return _mm256_blend_epi16(a, b, mask);
  10040. }
  10041. #endif
  10042. }
  10043. namespace Mem
  10044. {
  10045. #ifdef Vc_IMPL_AVX2
  10046. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteLo(__m256i x) {
  10047. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
  10048. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
  10049. return _mm256_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  10050. }
  10051. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteHi(__m256i x) {
  10052. static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
  10053. static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
  10054. return _mm256_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
  10055. }
  10056. #endif
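// permute128/shuffle128: operate on whole 128-bit lanes of a 256-bit register via
// vperm2f128 (or vperm2i128 with AVX2 for integer vectors). Const0 selects a zeroed lane;
// X0/X1 address the lanes of the first operand and Y0/Y1 those of the second.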
  10057. template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x) {
  10058. static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
  10059. static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
  10060. return _mm256_permute2f128_ps(
  10061. x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
  10062. }
  10063. template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x) {
  10064. static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
  10065. static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
  10066. return _mm256_permute2f128_pd(
  10067. x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
  10068. }
  10069. template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x) {
  10070. static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
  10071. static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
  10072. #ifdef Vc_IMPL_AVX2
  10073. return _mm256_permute2x128_si256(
  10074. x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
  10075. #else
  10076. return _mm256_permute2f128_si256(
  10077. x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
  10078. #endif
  10079. }
  10080. template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle128(__m256 x, __m256 y) {
  10081. static_assert(L >= X0 && H >= X0, "Incorrect_Range");
  10082. static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
  10083. return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
  10084. }
  10085. template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST shuffle128(__m256i x, __m256i y) {
  10086. static_assert(L >= X0 && H >= X0, "Incorrect_Range");
  10087. static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
  10088. #ifdef Vc_IMPL_AVX2
  10089. return _mm256_permute2x128_si256(
  10090. x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
  10091. #else
  10092. return _mm256_permute2f128_si256(
  10093. x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
  10094. #endif
  10095. }
  10096. template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle128(__m256d x, __m256d y) {
  10097. static_assert(L >= X0 && H >= X0, "Incorrect_Range");
  10098. static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
  10099. return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
  10100. }
  10101. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
  10102. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
  10103. static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
  10104. return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
  10105. }
  10106. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
  10107. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
  10108. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
  10109. return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  10110. }
  10111. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute(__m256i x) {
  10112. return _mm256_castps_si256(permute<Dst0, Dst1, Dst2, Dst3>(_mm256_castsi256_ps(x)));
  10113. }
  10114. #ifdef Vc_IMPL_AVX2
  10115. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute4x64(__m256i x) {
  10116. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
  10117. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
  10118. return _mm256_permute4x64_epi64(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  10119. }
  10120. #endif
  10121. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
  10122. static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
  10123. static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
  10124. return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
  10125. }
  10126. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
  10127. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
  10128. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
  10129. return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
  10130. }
  10131. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
  10132. static Vc_ALWAYS_INLINE __m256 Vc_CONST blend(__m256 x, __m256 y) {
  10133. static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
  10134. static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
  10135. static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
  10136. static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
  10137. static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
  10138. static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
  10139. static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
  10140. static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
  10141. return _mm256_blend_ps(x, y,
  10142. (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
  10143. (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
  10144. (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
  10145. (Dst6 / Y6) * 64 + (Dst7 / Y7) *128
  10146. );
  10147. }
  10148. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
  10149. static Vc_ALWAYS_INLINE __m256i Vc_CONST blend(__m256i x, __m256i y) {
  10150. return _mm256_castps_si256(blend<Dst0, Dst1, Dst2, Dst3, Dst4, Dst5, Dst6, Dst7>(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
  10151. }
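// Generic 8-element float permute across both 128-bit lanes. When both halves use the
// same in-lane pattern, this falls back to the 4-element permute above; otherwise the two
// result halves are built independently from in-lane permutes, shuffles, unpacks and
// blends of the input halves and reassembled with insertf128.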
  10152. template<VecPos Dst> struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; };
  10153. template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
  10154. static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
  10155. static_assert(Dst0 >= X0 && Dst0 <= X7, "Incorrect_Range");
  10156. static_assert(Dst1 >= X0 && Dst1 <= X7, "Incorrect_Range");
  10157. static_assert(Dst2 >= X0 && Dst2 <= X7, "Incorrect_Range");
  10158. static_assert(Dst3 >= X0 && Dst3 <= X7, "Incorrect_Range");
  10159. static_assert(Dst4 >= X0 && Dst4 <= X7, "Incorrect_Range");
  10160. static_assert(Dst5 >= X0 && Dst5 <= X7, "Incorrect_Range");
  10161. static_assert(Dst6 >= X0 && Dst6 <= X7, "Incorrect_Range");
  10162. static_assert(Dst7 >= X0 && Dst7 <= X7, "Incorrect_Range");
  10163. if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) {
  10164. return permute<Dst0, Dst1, Dst2, Dst3>(x);
  10165. }
  10166. const __m128 loIn = _mm256_castps256_ps128(x);
  10167. const __m128 hiIn = _mm256_extractf128_ps(x, 1);
  10168. __m128 lo, hi;
  10169. if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) {
  10170. lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  10171. } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) {
  10172. lo = _mm_permute_ps(hiIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  10173. } else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) {
  10174. lo = shuffle<Dst0, Dst1, Dst2 - X4 + Y0, Dst3 - X4 + Y0>(loIn, hiIn);
  10175. } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) {
  10176. lo = shuffle<Dst0 - X4, Dst1 - X4, Dst2 + Y0, Dst3 + Y0>(hiIn, loIn);
  10177. } else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) {
  10178. lo = _mm_unpacklo_ps(loIn, hiIn);
  10179. } else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) {
  10180. lo = _mm_unpacklo_ps(hiIn, loIn);
  10181. } else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) {
  10182. lo = _mm_unpackhi_ps(loIn, hiIn);
  10183. } else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) {
  10184. lo = _mm_unpackhi_ps(hiIn, loIn);
  10185. } else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) {
  10186. lo = blend<ScaleForBlend<Dst0>::Value, ScaleForBlend<Dst1>::Value,
  10187. ScaleForBlend<Dst2>::Value, ScaleForBlend<Dst3>::Value>(loIn, hiIn);
  10188. }
  10189. if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) {
  10190. hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
  10191. } else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) {
  10192. hi = _mm_permute_ps(loIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
  10193. } else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) {
  10194. hi = shuffle<Dst4, Dst5, Dst6 - X4 + Y0, Dst7 - X4 + Y0>(loIn, hiIn);
  10195. } else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) {
  10196. hi = shuffle<Dst4 - X4, Dst5 - X4, Dst6 + Y0, Dst7 + Y0>(hiIn, loIn);
  10197. } else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) {
  10198. hi = _mm_unpacklo_ps(loIn, hiIn);
  10199. } else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) {
  10200. hi = _mm_unpacklo_ps(hiIn, loIn);
  10201. } else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) {
  10202. hi = _mm_unpackhi_ps(loIn, hiIn);
  10203. } else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) {
  10204. hi = _mm_unpackhi_ps(hiIn, loIn);
  10205. } else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) {
  10206. hi = blend<ScaleForBlend<Dst4>::Value, ScaleForBlend<Dst5>::Value,
  10207. ScaleForBlend<Dst6>::Value, ScaleForBlend<Dst7>::Value>(loIn, hiIn);
  10208. }
  10209. return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
  10210. }
  10211. }
  10212. }
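// The Reg namespace provides analogous lane and element permutes to Mem above, but with
// the template arguments given in register order (highest destination position first).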
  10213. namespace Vc_VERSIONED_NAMESPACE
  10214. {
  10215. namespace Reg
  10216. {
  10217. template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x, __m256 y) {
  10218. static_assert(L >= X0 && H >= X0, "Incorrect_Range");
  10219. static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
  10220. return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
  10221. }
  10222. template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x, __m256i y) {
  10223. static_assert(L >= X0 && H >= X0, "Incorrect_Range");
  10224. static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
  10225. #ifdef Vc_IMPL_AVX2
  10226. return _mm256_permute2x128_si256(
  10227. x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
  10228. #else
  10229. return _mm256_permute2f128_si256(
  10230. x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
  10231. #endif
  10232. }
  10233. template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x, __m256d y) {
  10234. static_assert(L >= X0 && H >= X0, "Incorrect_Range");
  10235. static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
  10236. return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
  10237. }
  10238. template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
  10239. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
  10240. static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
  10241. return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
  10242. }
  10243. template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
  10244. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
  10245. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
  10246. return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  10247. }
  10248. template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST permute(__m128d x) {
  10249. static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
  10250. static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
  10251. return _mm_permute_pd(x, Dst0 + Dst1 * 2);
  10252. }
  10253. template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
  10254. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
  10255. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
  10256. return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  10257. }
  10258. template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
  10259. static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
  10260. static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
  10261. return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
  10262. }
  10263. template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
  10264. static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
  10265. static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
  10266. return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
  10267. }
  10268. }
  10269. }
  10270. #endif
  10271. namespace Vc_VERSIONED_NAMESPACE
  10272. {
  10273. namespace AVX
  10274. {
  10275. namespace Casts
  10276. {
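// avx_cast<T>(v): reinterpreting casts between the 128-bit and 256-bit register types.
// Casting a 128-bit register up to a 256-bit type leaves the upper half unspecified;
// casting 256 bits down to 128 bits drops the upper half.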
  10277. template<typename T> Vc_INTRINSIC_L T avx_cast(__m128 v) Vc_INTRINSIC_R;
  10278. template<typename T> Vc_INTRINSIC_L T avx_cast(__m128i v) Vc_INTRINSIC_R;
  10279. template<typename T> Vc_INTRINSIC_L T avx_cast(__m128d v) Vc_INTRINSIC_R;
  10280. template<typename T> Vc_INTRINSIC_L T avx_cast(__m256 v) Vc_INTRINSIC_R;
  10281. template<typename T> Vc_INTRINSIC_L T avx_cast(__m256i v) Vc_INTRINSIC_R;
  10282. template<typename T> Vc_INTRINSIC_L T avx_cast(__m256d v) Vc_INTRINSIC_R;
  10283. template<> Vc_INTRINSIC __m128 avx_cast(__m128 v) { return v; }
  10284. template<> Vc_INTRINSIC __m128 avx_cast(__m128i v) { return _mm_castsi128_ps(v); }
  10285. template<> Vc_INTRINSIC __m128 avx_cast(__m128d v) { return _mm_castpd_ps(v); }
  10286. template<> Vc_INTRINSIC __m128i avx_cast(__m128 v) { return _mm_castps_si128(v); }
  10287. template<> Vc_INTRINSIC __m128i avx_cast(__m128i v) { return v; }
  10288. template<> Vc_INTRINSIC __m128i avx_cast(__m128d v) { return _mm_castpd_si128(v); }
  10289. template<> Vc_INTRINSIC __m128d avx_cast(__m128 v) { return _mm_castps_pd(v); }
  10290. template<> Vc_INTRINSIC __m128d avx_cast(__m128i v) { return _mm_castsi128_pd(v); }
  10291. template<> Vc_INTRINSIC __m128d avx_cast(__m128d v) { return v; }
  10292. template<> Vc_INTRINSIC __m256 avx_cast(__m128 v) { return _mm256_castps128_ps256(v); }
  10293. template<> Vc_INTRINSIC __m256 avx_cast(__m128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); }
  10294. template<> Vc_INTRINSIC __m256 avx_cast(__m128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); }
  10295. template<> Vc_INTRINSIC __m256i avx_cast(__m128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); }
  10296. template<> Vc_INTRINSIC __m256i avx_cast(__m128i v) { return _mm256_castsi128_si256(v); }
  10297. template<> Vc_INTRINSIC __m256i avx_cast(__m128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); }
  10298. template<> Vc_INTRINSIC __m256d avx_cast(__m128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); }
  10299. template<> Vc_INTRINSIC __m256d avx_cast(__m128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); }
  10300. template<> Vc_INTRINSIC __m256d avx_cast(__m128d v) { return _mm256_castpd128_pd256(v); }
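// zeroExtend(v): widen a 128-bit register to 256 bits with the upper half guaranteed to
// be zero. On MSVC and clang the plain cast intrinsics are not relied upon to zero the
// upper half, so a vperm2f128 with a zeroing selector (0x80) is used instead.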
  10301. #if defined Vc_MSVC || defined Vc_CLANG || defined Vc_APPLECLANG
  10302. static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); }
  10303. static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); }
  10304. static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); }
  10305. #else
  10306. static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); }
  10307. static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); }
  10308. static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); }
  10309. #endif
  10310. template<> Vc_INTRINSIC __m128 avx_cast(__m256 v) { return _mm256_castps256_ps128(v); }
  10311. template<> Vc_INTRINSIC __m128 avx_cast(__m256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); }
  10312. template<> Vc_INTRINSIC __m128 avx_cast(__m256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); }
  10313. template<> Vc_INTRINSIC __m128i avx_cast(__m256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); }
  10314. template<> Vc_INTRINSIC __m128i avx_cast(__m256i v) { return _mm256_castsi256_si128(v); }
  10315. template<> Vc_INTRINSIC __m128i avx_cast(__m256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); }
  10316. template<> Vc_INTRINSIC __m128d avx_cast(__m256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); }
  10317. template<> Vc_INTRINSIC __m128d avx_cast(__m256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); }
  10318. template<> Vc_INTRINSIC __m128d avx_cast(__m256d v) { return _mm256_castpd256_pd128(v); }
  10319. template<> Vc_INTRINSIC __m256 avx_cast(__m256 v) { return v; }
  10320. template<> Vc_INTRINSIC __m256 avx_cast(__m256i v) { return _mm256_castsi256_ps(v); }
  10321. template<> Vc_INTRINSIC __m256 avx_cast(__m256d v) { return _mm256_castpd_ps(v); }
  10322. template<> Vc_INTRINSIC __m256i avx_cast(__m256 v) { return _mm256_castps_si256(v); }
  10323. template<> Vc_INTRINSIC __m256i avx_cast(__m256i v) { return v; }
  10324. template<> Vc_INTRINSIC __m256i avx_cast(__m256d v) { return _mm256_castpd_si256(v); }
  10325. template<> Vc_INTRINSIC __m256d avx_cast(__m256 v) { return _mm256_castps_pd(v); }
  10326. template<> Vc_INTRINSIC __m256d avx_cast(__m256i v) { return _mm256_castsi256_pd(v); }
  10327. template<> Vc_INTRINSIC __m256d avx_cast(__m256d v) { return v; }
  10328. Vc_INTRINSIC Vc_CONST __m128 lo128(__m256 v) { return avx_cast<__m128>(v); }
  10329. Vc_INTRINSIC Vc_CONST __m128d lo128(__m256d v) { return avx_cast<__m128d>(v); }
  10330. Vc_INTRINSIC Vc_CONST __m128i lo128(__m256i v) { return avx_cast<__m128i>(v); }
  10331. Vc_INTRINSIC Vc_CONST __m128 hi128(__m256 v) { return extract128<1>(v); }
  10332. Vc_INTRINSIC Vc_CONST __m128d hi128(__m256d v) { return extract128<1>(v); }
  10333. Vc_INTRINSIC Vc_CONST __m128i hi128(__m256i v) { return extract128<1>(v); }
  10334. Vc_INTRINSIC Vc_CONST __m256 concat(__m128 a, __m128 b) { return insert128<1>(avx_cast<__m256 >(a), b); }
  10335. Vc_INTRINSIC Vc_CONST __m256d concat(__m128d a, __m128d b) { return insert128<1>(avx_cast<__m256d>(a), b); }
  10336. Vc_INTRINSIC Vc_CONST __m256i concat(__m128i a, __m128i b) { return insert128<1>(avx_cast<__m256i>(a), b); }
  10337. }
  10338. using namespace Casts;
  10339. }
  10340. namespace AVX2
  10341. {
  10342. using namespace AVX::Casts;
  10343. }
  10344. namespace AVX
  10345. {
  10346. template <typename From, typename To> struct ConvertTag {};
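// convert(v, ConvertTag<From, To>): element-type conversions dispatched on a tag type.
// Widening integer conversions use AVX2 sign/zero extension where available and an
// unpack-plus-shift fallback otherwise; the float/uint and double/uint paths add the
// usual 2^31 bias handling because there is no unsigned conversion instruction.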
  10347. Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag<float , int>) { return _mm256_cvttps_epi32(v); }
  10348. Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, int>) { return _mm256_cvttpd_epi32(v); }
  10349. Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int , int>) { return v; }
  10350. Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint , int>) { return v; }
  10351. Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , int>) {
  10352. #ifdef Vc_IMPL_AVX2
  10353. return _mm256_cvtepi16_epi32(v);
  10354. #else
  10355. return AVX::srai_epi32<16>(
  10356. concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
  10357. #endif
  10358. }
  10359. Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, int>) {
  10360. #ifdef Vc_IMPL_AVX2
  10361. return _mm256_cvtepu16_epi32(v);
  10362. #else
  10363. return AVX::srli_epi32<16>(
  10364. concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
  10365. #endif
  10366. }
  10367. Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag<float , uint>) {
  10368. using namespace AVX;
  10369. return _mm256_castps_si256(_mm256_blendv_ps(
  10370. _mm256_castsi256_ps(_mm256_cvttps_epi32(v)),
  10371. _mm256_castsi256_ps(add_epi32(_mm256_cvttps_epi32(_mm256_sub_ps(v, set2power31_ps())),
  10372. set2power31_epu32())),
  10373. cmpge_ps(v, set2power31_ps())));
  10374. }
  10375. Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, uint>) {
  10376. using namespace AVX;
  10377. return _mm_xor_si128(
  10378. _mm256_cvttpd_epi32(_mm256_sub_pd(_mm256_floor_pd(v), set1_pd(0x80000000u))),
  10379. _mm_set2power31_epu32());
  10380. }
  10381. Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int , uint>) { return v; }
  10382. Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint , uint>) { return v; }
  10383. Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , uint>) {
  10384. #ifdef Vc_IMPL_AVX2
  10385. return _mm256_cvtepi16_epi32(v);
  10386. #else
  10387. return AVX::srai_epi32<16>(
  10388. concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
  10389. #endif
  10390. }
  10391. Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, uint>) {
  10392. #ifdef Vc_IMPL_AVX2
  10393. return _mm256_cvtepu16_epi32(v);
  10394. #else
  10395. return AVX::srli_epi32<16>(
  10396. concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
  10397. #endif
  10398. }
  10399. Vc_INTRINSIC __m256 convert(__m256 v, ConvertTag<float , float>) { return v; }
  10400. Vc_INTRINSIC __m128 convert(__m256d v, ConvertTag<double, float>) { return _mm256_cvtpd_ps(v); }
  10401. Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag<int , float>) { return _mm256_cvtepi32_ps(v); }
  10402. Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag<uint , float>) {
  10403. using namespace AVX;
  10404. return _mm256_blendv_ps(
  10405. _mm256_cvtepi32_ps(v),
  10406. _mm256_add_ps(_mm256_cvtepi32_ps(and_si256(v, set1_epi32(0x7ffffe00))),
  10407. _mm256_add_ps(set2power31_ps(), _mm256_cvtepi32_ps(and_si256(
  10408. v, set1_epi32(0x000001ff))))),
  10409. _mm256_castsi256_ps(cmplt_epi32(v, _mm256_setzero_si256())));
  10410. }
  10411. Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag<short , float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag< short, int>())); }
  10412. Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag<ushort, float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag<ushort, int>())); }
  10413. Vc_INTRINSIC __m256d convert(__m128 v, ConvertTag<float , double>) { return _mm256_cvtps_pd(v); }
  10414. Vc_INTRINSIC __m256d convert(__m256d v, ConvertTag<double, double>) { return v; }
  10415. Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<int , double>) { return _mm256_cvtepi32_pd(v); }
  10416. Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<uint , double>) {
  10417. using namespace AVX;
  10418. return _mm256_add_pd(
  10419. _mm256_cvtepi32_pd(_mm_xor_si128(v, _mm_setmin_epi32())),
  10420. set1_pd(1u << 31)); }
  10421. Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<short , double>) { return convert(convert(v, SSE::ConvertTag< short, int>()), ConvertTag<int, double>()); }
  10422. Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<ushort, double>) { return convert(convert(v, SSE::ConvertTag<ushort, int>()), ConvertTag<int, double>()); }
  10423. Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int , short>) {
  10424. #ifdef Vc_IMPL_AVX2
  10425. auto a = _mm256_shuffle_epi8(
  10426. v, _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
  10427. -0x80, -0x80, -0x80, 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
  10428. -0x80, -0x80, -0x80, -0x80, -0x80, -0x80));
  10429. return lo128(_mm256_permute4x64_epi64(a, 0xf8));
  10430. #else
  10431. const auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
  10432. const auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
  10433. const auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
  10434. const auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
  10435. return _mm_unpacklo_epi16(tmp2, tmp3);
  10436. #endif
  10437. }
  10438. Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint , short>) { return convert(v, ConvertTag<int, short>()); }
  10439. Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag<float , short>) { return convert(convert(v, ConvertTag<float, int>()), ConvertTag<int, short>()); }
  10440. Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, short>) { return convert(convert(v, ConvertTag<double, int>()), SSE::ConvertTag<int, short>()); }
  10441. Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , short>) { return v; }
  10442. Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, short>) { return v; }
  10443. Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int , ushort>) {
  10444. auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
  10445. auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
  10446. auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
  10447. auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
  10448. return _mm_unpacklo_epi16(tmp2, tmp3);
  10449. }
  10450. Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint , ushort>) {
  10451. auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
  10452. auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
  10453. auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
  10454. auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
  10455. return _mm_unpacklo_epi16(tmp2, tmp3);
  10456. }
  10457. Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag<float , ushort>) { return convert(convert(v, ConvertTag<float, uint>()), ConvertTag<uint, ushort>()); }
  10458. Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, ushort>) { return convert(convert(v, ConvertTag<double, uint>()), SSE::ConvertTag<uint, ushort>()); }
  10459. Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , ushort>) { return v; }
  10460. Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, ushort>) { return v; }
  10461. template <typename From, typename To>
  10462. Vc_INTRINSIC auto convert(
  10463. typename std::conditional<(sizeof(From) < sizeof(To)),
  10464. typename SSE::VectorTraits<From>::VectorType,
  10465. typename AVX::VectorTypeHelper<From>::Type>::type v)
  10466. -> decltype(convert(v, ConvertTag<From, To>()))
  10467. {
  10468. return convert(v, ConvertTag<From, To>());
  10469. }
  10470. template <typename From, typename To, typename = enable_if<(sizeof(From) < sizeof(To))>>
  10471. Vc_INTRINSIC auto convert(typename AVX::VectorTypeHelper<From>::Type v)
  10472. -> decltype(convert(lo128(v), ConvertTag<From, To>()))
  10473. {
  10474. return convert(lo128(v), ConvertTag<From, To>());
  10475. }
  10476. }
  10477. }
  10478. #endif
  10479. #endif
  10480. #ifndef VC_SSE_VECTOR_H_
  10481. #error "Vc/sse/vector.h needs to be included before Vc/sse/simd_cast.h"
  10482. #endif
  10483. namespace Vc_VERSIONED_NAMESPACE
  10484. {
  10485. namespace SSE
  10486. {
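// The Vc_SIMD_CAST_{1,2,4,8} macros declare simd_cast overloads taking one, two, four or
// eight source vectors of type from_ and returning to_, SFINAE-restricted via enable_if
// so that the overload is selected by the requested return type.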
  10487. #define Vc_SIMD_CAST_1(from_,to_) \
  10488. template <typename To> \
  10489. Vc_INTRINSIC Vc_CONST To simd_cast( \
  10490. from_ x, enable_if<std::is_same<To, to_>::value> = nullarg)
  10491. #define Vc_SIMD_CAST_2(from_,to_) \
  10492. template <typename To> \
  10493. Vc_INTRINSIC Vc_CONST To simd_cast( \
  10494. from_ x0, from_ x1, enable_if<std::is_same<To, to_>::value> = nullarg)
  10495. #define Vc_SIMD_CAST_4(from_,to_) \
  10496. template <typename To> \
  10497. Vc_INTRINSIC Vc_CONST To simd_cast( \
  10498. from_ x0, from_ x1, from_ x2, from_ x3, \
  10499. enable_if<std::is_same<To, to_>::value> = nullarg)
  10500. #define Vc_SIMD_CAST_8(from_,to_) \
  10501. template <typename To> \
  10502. Vc_INTRINSIC Vc_CONST To simd_cast( \
  10503. from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, from_ x7, \
  10504. enable_if<std::is_same<To, to_>::value> = nullarg)
  10505. Vc_SIMD_CAST_1( float_v, int_v);
  10506. Vc_SIMD_CAST_1(double_v, int_v);
  10507. Vc_SIMD_CAST_1( uint_v, int_v);
  10508. Vc_SIMD_CAST_1( short_v, int_v);
  10509. Vc_SIMD_CAST_1(ushort_v, int_v);
  10510. Vc_SIMD_CAST_1( float_v, uint_v);
  10511. Vc_SIMD_CAST_1(double_v, uint_v);
  10512. Vc_SIMD_CAST_1( int_v, uint_v);
  10513. Vc_SIMD_CAST_1( short_v, uint_v);
  10514. Vc_SIMD_CAST_1(ushort_v, uint_v);
  10515. Vc_SIMD_CAST_1(double_v, float_v);
  10516. Vc_SIMD_CAST_1( int_v, float_v);
  10517. Vc_SIMD_CAST_1( uint_v, float_v);
  10518. Vc_SIMD_CAST_1( short_v, float_v);
  10519. Vc_SIMD_CAST_1(ushort_v, float_v);
  10520. Vc_SIMD_CAST_1( float_v, double_v);
  10521. Vc_SIMD_CAST_1( int_v, double_v);
  10522. Vc_SIMD_CAST_1( uint_v, double_v);
  10523. Vc_SIMD_CAST_1( short_v, double_v);
  10524. Vc_SIMD_CAST_1(ushort_v, double_v);
  10525. Vc_SIMD_CAST_1( int_v, short_v);
  10526. Vc_SIMD_CAST_1( uint_v, short_v);
  10527. Vc_SIMD_CAST_1( float_v, short_v);
  10528. Vc_SIMD_CAST_1(double_v, short_v);
  10529. Vc_SIMD_CAST_1(ushort_v, short_v);
  10530. Vc_SIMD_CAST_1( int_v, ushort_v);
  10531. Vc_SIMD_CAST_1( uint_v, ushort_v);
  10532. Vc_SIMD_CAST_1( float_v, ushort_v);
  10533. Vc_SIMD_CAST_1(double_v, ushort_v);
  10534. Vc_SIMD_CAST_1( short_v, ushort_v);
  10535. Vc_SIMD_CAST_2(double_v, int_v);
  10536. Vc_SIMD_CAST_2(double_v, uint_v);
  10537. Vc_SIMD_CAST_2(double_v, float_v);
  10538. Vc_SIMD_CAST_2( int_v, short_v);
  10539. Vc_SIMD_CAST_2( uint_v, short_v);
  10540. Vc_SIMD_CAST_2( float_v, short_v);
  10541. Vc_SIMD_CAST_2(double_v, short_v);
  10542. Vc_SIMD_CAST_2( int_v, ushort_v);
  10543. Vc_SIMD_CAST_2( uint_v, ushort_v);
  10544. Vc_SIMD_CAST_2( float_v, ushort_v);
  10545. Vc_SIMD_CAST_2(double_v, ushort_v);
  10546. #define Vc_CAST_(To_) \
  10547. template <typename Return> \
  10548. Vc_INTRINSIC Vc_CONST enable_if<std::is_same<Return, To_>::value, Return>
  10549. Vc_CAST_(short_v) simd_cast(double_v a, double_v b, double_v c);
  10550. Vc_CAST_(ushort_v) simd_cast(double_v a, double_v b, double_v c);
  10551. Vc_SIMD_CAST_4(double_v, short_v);
  10552. Vc_SIMD_CAST_4(double_v, ushort_v);
  10553. }
  10554. using SSE::simd_cast;
  10555. template <typename Return, typename T>
  10556. Vc_INTRINSIC Vc_CONST Return
  10557. simd_cast(Scalar::Vector<T> x,
  10558. enable_if<std::is_same<Return, SSE::double_v>::value> = nullarg);
  10559. template <typename Return, typename T>
  10560. Vc_INTRINSIC Vc_CONST Return
  10561. simd_cast(Scalar::Vector<T> x,
  10562. enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
  10563. template <typename Return, typename T>
  10564. Vc_INTRINSIC Vc_CONST Return
  10565. simd_cast(Scalar::Vector<T> x,
  10566. enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
  10567. template <typename Return, typename T>
  10568. Vc_INTRINSIC Vc_CONST Return
  10569. simd_cast(Scalar::Vector<T> x,
  10570. enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
  10571. template <typename Return, typename T>
  10572. Vc_INTRINSIC Vc_CONST Return
  10573. simd_cast(Scalar::Vector<T> x,
  10574. enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
  10575. template <typename Return, typename T>
  10576. Vc_INTRINSIC Vc_CONST Return
  10577. simd_cast(Scalar::Vector<T> x,
  10578. enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
  10579. template <typename Return, typename T>
  10580. Vc_INTRINSIC Vc_CONST Return
  10581. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  10582. enable_if<std::is_same<Return, SSE::double_v>::value> = nullarg);
  10583. template <typename Return, typename T>
  10584. Vc_INTRINSIC Vc_CONST Return
  10585. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  10586. enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
  10587. template <typename Return, typename T>
  10588. Vc_INTRINSIC Vc_CONST Return
  10589. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  10590. enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
  10591. template <typename Return, typename T>
  10592. Vc_INTRINSIC Vc_CONST Return
  10593. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  10594. enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
  10595. template <typename Return, typename T>
  10596. Vc_INTRINSIC Vc_CONST Return
  10597. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  10598. enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
  10599. template <typename Return, typename T>
  10600. Vc_INTRINSIC Vc_CONST Return
  10601. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  10602. enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
  10603. template <typename Return, typename T>
  10604. Vc_INTRINSIC Vc_CONST Return
  10605. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x3,
  10606. enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
  10607. template <typename Return, typename T>
  10608. Vc_INTRINSIC Vc_CONST Return
  10609. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x3,
  10610. enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
  10611. template <typename Return, typename T>
  10612. Vc_INTRINSIC Vc_CONST Return
  10613. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x3,
  10614. enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
  10615. template <typename Return, typename T>
  10616. Vc_INTRINSIC Vc_CONST Return
  10617. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x3,
  10618. enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
  10619. template <typename Return, typename T>
  10620. Vc_INTRINSIC Vc_CONST Return
  10621. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x3,
  10622. enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
  10623. template <typename Return, typename T>
  10624. Vc_INTRINSIC Vc_CONST Return
  10625. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10626. Scalar::Vector<T> x3,
  10627. enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
  10628. template <typename Return, typename T>
  10629. Vc_INTRINSIC Vc_CONST Return
  10630. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10631. Scalar::Vector<T> x3,
  10632. enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
  10633. template <typename Return, typename T>
  10634. Vc_INTRINSIC Vc_CONST Return
  10635. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10636. Scalar::Vector<T> x3,
  10637. enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
  10638. template <typename Return, typename T>
  10639. Vc_INTRINSIC Vc_CONST Return
  10640. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10641. Scalar::Vector<T> x3,
  10642. enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
  10643. template <typename Return, typename T>
  10644. Vc_INTRINSIC Vc_CONST Return
  10645. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10646. Scalar::Vector<T> x3,
  10647. enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
  10648. template <typename Return, typename T>
  10649. Vc_INTRINSIC Vc_CONST Return
  10650. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10651. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  10652. enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
  10653. template <typename Return, typename T>
  10654. Vc_INTRINSIC Vc_CONST Return
  10655. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10656. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  10657. enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
  10658. template <typename Return, typename T>
  10659. Vc_INTRINSIC Vc_CONST Return
  10660. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10661. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  10662. enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
  10663. template <typename Return, typename T>
  10664. Vc_INTRINSIC Vc_CONST Return
  10665. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10666. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  10667. enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
  10668. template <typename Return, typename T>
  10669. Vc_INTRINSIC Vc_CONST Return
  10670. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10671. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  10672. Scalar::Vector<T> x6,
  10673. enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
  10674. template <typename Return, typename T>
  10675. Vc_INTRINSIC Vc_CONST Return
  10676. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10677. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  10678. Scalar::Vector<T> x6,
  10679. enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
  10680. template <typename Return, typename T>
  10681. Vc_INTRINSIC Vc_CONST Return
  10682. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10683. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  10684. Scalar::Vector<T> x6, Scalar::Vector<T> x7,
  10685. enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
  10686. template <typename Return, typename T>
  10687. Vc_INTRINSIC Vc_CONST Return
  10688. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10689. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  10690. Scalar::Vector<T> x6, Scalar::Vector<T> x7,
  10691. enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
  10692. template <typename To, typename FromT>
  10693. Vc_INTRINSIC Vc_CONST To
  10694. simd_cast(SSE::Vector<FromT> x, enable_if<Scalar::is_vector<To>::value> = nullarg);
  10695. #undef Vc_SIMD_CAST_1
  10696. #undef Vc_SIMD_CAST_2
  10697. #undef Vc_SIMD_CAST_4
  10698. #undef Vc_SIMD_CAST_8
  10699. #define Vc_SIMD_CAST_1(from_,to_) \
  10700. template <typename To> \
  10701. Vc_INTRINSIC Vc_CONST To simd_cast(from_ x, enable_if<std::is_same<To, to_>::value>)
  10702. #define Vc_SIMD_CAST_2(from_,to_) \
  10703. template <typename To> \
  10704. Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, \
  10705. enable_if<std::is_same<To, to_>::value>)
  10706. #define Vc_SIMD_CAST_4(from_,to_) \
  10707. template <typename To> \
  10708. Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, \
  10709. enable_if<std::is_same<To, to_>::value>)
  10710. #define Vc_SIMD_CAST_8(from_,to_) \
  10711. template <typename To> \
  10712. Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
  10713. from_ x5, from_ x6, from_ x7, \
  10714. enable_if<std::is_same<To, to_>::value>)
  10715. namespace SSE
  10716. {
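// convert_int32_to_int16(a, b): truncate two int32 vectors into one vector of eight int16
// values (a supplies elements 0..3, b elements 4..7) using a sequence of 16-bit unpacks
// rather than a saturating pack.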
  10717. Vc_INTRINSIC __m128i convert_int32_to_int16(__m128i a, __m128i b)
  10718. {
  10719. auto tmp0 = _mm_unpacklo_epi16(a, b);
  10720. auto tmp1 = _mm_unpackhi_epi16(a, b);
  10721. auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
  10722. auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
  10723. return _mm_unpacklo_epi16(tmp2, tmp3);
  10724. }
  10725. Vc_SIMD_CAST_1( float_v, int_v) { return convert< float, int>(x.data()); }
  10726. Vc_SIMD_CAST_1(double_v, int_v) { return convert<double, int>(x.data()); }
  10727. Vc_SIMD_CAST_1( uint_v, int_v) { return convert< uint, int>(x.data()); }
  10728. Vc_SIMD_CAST_1( short_v, int_v) { return convert< short, int>(x.data()); }
  10729. Vc_SIMD_CAST_1(ushort_v, int_v) { return convert<ushort, int>(x.data()); }
  10730. Vc_SIMD_CAST_1( float_v, uint_v) { return convert< float, uint>(x.data()); }
  10731. Vc_SIMD_CAST_1(double_v, uint_v) { return convert<double, uint>(x.data()); }
  10732. Vc_SIMD_CAST_1( int_v, uint_v) { return convert< int, uint>(x.data()); }
  10733. Vc_SIMD_CAST_1( short_v, uint_v) { return convert< short, uint>(x.data()); }
  10734. Vc_SIMD_CAST_1(ushort_v, uint_v) { return convert<ushort, uint>(x.data()); }
  10735. Vc_SIMD_CAST_1(double_v, float_v) { return convert<double, float>(x.data()); }
  10736. Vc_SIMD_CAST_1( int_v, float_v) { return convert< int, float>(x.data()); }
  10737. Vc_SIMD_CAST_1( uint_v, float_v) { return convert< uint, float>(x.data()); }
  10738. Vc_SIMD_CAST_1( short_v, float_v) { return convert< short, float>(x.data()); }
  10739. Vc_SIMD_CAST_1(ushort_v, float_v) { return convert<ushort, float>(x.data()); }
  10740. Vc_SIMD_CAST_1( float_v, double_v) { return convert< float, double>(x.data()); }
  10741. Vc_SIMD_CAST_1( int_v, double_v) { return convert< int, double>(x.data()); }
  10742. Vc_SIMD_CAST_1( uint_v, double_v) { return convert< uint, double>(x.data()); }
  10743. Vc_SIMD_CAST_1( short_v, double_v) { return convert< short, double>(x.data()); }
  10744. Vc_SIMD_CAST_1(ushort_v, double_v) { return convert<ushort, double>(x.data()); }
  10745. Vc_SIMD_CAST_1( int_v, short_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); }
  10746. Vc_SIMD_CAST_1( uint_v, short_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); }
  10747. Vc_SIMD_CAST_1( float_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x).data(), _mm_setzero_si128()); }
  10748. Vc_SIMD_CAST_1(double_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x).data(), _mm_setzero_si128()); }
  10749. Vc_SIMD_CAST_1(ushort_v, short_v) { return x.data(); }
  10750. Vc_SIMD_CAST_1( int_v, ushort_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); }
  10751. Vc_SIMD_CAST_1( uint_v, ushort_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); }
  10752. Vc_SIMD_CAST_1( float_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x)); }
  10753. Vc_SIMD_CAST_1(double_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x)); }
  10754. Vc_SIMD_CAST_1( short_v, ushort_v) { return x.data(); }
  10755. Vc_SIMD_CAST_2(double_v, int_v) {
  10756. #ifdef Vc_IMPL_AVX
  10757. return AVX::convert<double, int>(AVX::concat(x0.data(), x1.data()));
  10758. #else
  10759. return _mm_unpacklo_epi64(convert<double, int>(x0.data()), convert<double, int>(x1.data()));
  10760. #endif
  10761. }
  10762. Vc_SIMD_CAST_2(double_v, uint_v) {
  10763. #ifdef Vc_IMPL_AVX
  10764. return AVX::convert<double, uint>(AVX::concat(x0.data(), x1.data()));
  10765. #else
  10766. return _mm_unpacklo_epi64(convert<double, uint>(x0.data()), convert<double, uint>(x1.data()));
  10767. #endif
  10768. }
  10769. Vc_SIMD_CAST_2(double_v, float_v) {
  10770. #ifdef Vc_IMPL_AVX
  10771. return _mm256_cvtpd_ps(AVX::concat(x0.data(), x1.data()));
  10772. #else
  10773. return _mm_movelh_ps(_mm_cvtpd_ps(x0.data()), _mm_cvtpd_ps(x1.data()));
  10774. #endif
  10775. }
  10776. Vc_SIMD_CAST_2( int_v, short_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); }
  10777. Vc_SIMD_CAST_2( uint_v, short_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); }
  10778. Vc_SIMD_CAST_2( float_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x0).data(), simd_cast<SSE::int_v>(x1).data()); }
  10779. Vc_SIMD_CAST_2(double_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x0, x1).data(), _mm_setzero_si128()); }
  10780. Vc_SIMD_CAST_2( int_v, ushort_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); }
  10781. Vc_SIMD_CAST_2( uint_v, ushort_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); }
  10782. Vc_SIMD_CAST_2( float_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x0), simd_cast<SSE::int_v>(x1)); }
  10783. Vc_SIMD_CAST_2(double_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x0, x1)); }
  10784. Vc_CAST_(short_v) simd_cast(double_v a, double_v b, double_v c)
  10785. {
  10786. return simd_cast<short_v>(simd_cast<int_v>(a, b), simd_cast<int_v>(c));
  10787. }
  10788. Vc_CAST_(ushort_v) simd_cast(double_v a, double_v b, double_v c)
  10789. {
  10790. return simd_cast<ushort_v>(simd_cast<int_v>(a, b), simd_cast<int_v>(c));
  10791. }
  10792. #undef Vc_CAST_
  10793. Vc_SIMD_CAST_4(double_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x0, x1).data(), simd_cast<SSE::int_v>(x2, x3).data()); }
  10794. Vc_SIMD_CAST_4(double_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x0, x1), simd_cast<SSE::int_v>(x2, x3)); }
  10795. }
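// Definitions of the Scalar -> SSE simd_cast overloads declared above: each builds an SSE
// vector with _mm_setr_*, filling the leading lanes from the scalar arguments and zeroing
// the remaining lanes.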
  10796. template <typename Return, typename T>
  10797. Vc_INTRINSIC Vc_CONST Return
  10798. simd_cast(Scalar::Vector<T> x,
  10799. enable_if<std::is_same<Return, SSE::double_v>::value> )
  10800. {
  10801. return _mm_setr_pd(x.data(), 0.);
  10802. }
  10803. template <typename Return, typename T>
  10804. Vc_INTRINSIC Vc_CONST Return
  10805. simd_cast(Scalar::Vector<T> x,
  10806. enable_if<std::is_same<Return, SSE::float_v>::value> )
  10807. {
  10808. return _mm_setr_ps(x.data(), 0.f, 0.f, 0.f);
  10809. }
  10810. template <typename Return, typename T>
  10811. Vc_INTRINSIC Vc_CONST Return
  10812. simd_cast(Scalar::Vector<T> x,
  10813. enable_if<std::is_same<Return, SSE::int_v>::value> )
  10814. {
  10815. return _mm_setr_epi32(x.data(), 0, 0, 0);
  10816. }
  10817. template <typename Return, typename T>
  10818. Vc_INTRINSIC Vc_CONST Return
  10819. simd_cast(Scalar::Vector<T> x,
  10820. enable_if<std::is_same<Return, SSE::uint_v>::value> )
  10821. {
  10822. return _mm_setr_epi32(uint(x.data()), 0, 0, 0);
  10823. }
  10824. template <typename Return, typename T>
  10825. Vc_INTRINSIC Vc_CONST Return
  10826. simd_cast(Scalar::Vector<T> x,
  10827. enable_if<std::is_same<Return, SSE::short_v>::value> )
  10828. {
  10829. return _mm_setr_epi16(
  10830. x.data(), 0, 0, 0, 0, 0, 0, 0);
  10831. }
  10832. template <typename Return, typename T>
  10833. Vc_INTRINSIC Vc_CONST Return
  10834. simd_cast(Scalar::Vector<T> x,
  10835. enable_if<std::is_same<Return, SSE::ushort_v>::value> )
  10836. {
  10837. return _mm_setr_epi16(
  10838. x.data(), 0, 0, 0, 0, 0, 0, 0);
  10839. }
  10840. template <typename Return, typename T>
  10841. Vc_INTRINSIC Vc_CONST Return
  10842. simd_cast(Scalar::Vector<T> x0,
  10843. Scalar::Vector<T> x1,
  10844. enable_if<std::is_same<Return, SSE::double_v>::value> )
  10845. {
  10846. return _mm_setr_pd(x0.data(), x1.data());
  10847. }
  10848. template <typename Return, typename T>
  10849. Vc_INTRINSIC Vc_CONST Return
  10850. simd_cast(Scalar::Vector<T> x0,
  10851. Scalar::Vector<T> x1,
  10852. enable_if<std::is_same<Return, SSE::float_v>::value> )
  10853. {
  10854. return _mm_setr_ps(x0.data(), x1.data(), 0.f, 0.f);
  10855. }
  10856. template <typename Return, typename T>
  10857. Vc_INTRINSIC Vc_CONST Return
  10858. simd_cast(Scalar::Vector<T> x0,
  10859. Scalar::Vector<T> x1,
  10860. enable_if<std::is_same<Return, SSE::int_v>::value> )
  10861. {
  10862. return _mm_setr_epi32(x0.data(), x1.data(), 0, 0);
  10863. }
  10864. template <typename Return, typename T>
  10865. Vc_INTRINSIC Vc_CONST Return
  10866. simd_cast(Scalar::Vector<T> x0,
  10867. Scalar::Vector<T> x1,
  10868. enable_if<std::is_same<Return, SSE::uint_v>::value> )
  10869. {
return _mm_setr_epi32(uint(x0.data()), uint(x1.data()), 0, 0);
  10872. }
  10873. template <typename Return, typename T>
  10874. Vc_INTRINSIC Vc_CONST Return
  10875. simd_cast(Scalar::Vector<T> x0,
  10876. Scalar::Vector<T> x1,
  10877. enable_if<std::is_same<Return, SSE::short_v>::value> )
  10878. {
  10879. return _mm_setr_epi16(
  10880. x0.data(), x1.data(), 0, 0, 0, 0, 0, 0);
  10881. }
  10882. template <typename Return, typename T>
  10883. Vc_INTRINSIC Vc_CONST Return
  10884. simd_cast(Scalar::Vector<T> x0,
  10885. Scalar::Vector<T> x1,
  10886. enable_if<std::is_same<Return, SSE::ushort_v>::value> )
  10887. {
  10888. return _mm_setr_epi16(
  10889. x0.data(), x1.data(), 0, 0, 0, 0, 0, 0);
  10890. }
  10891. template <typename Return, typename T>
  10892. Vc_INTRINSIC Vc_CONST Return
  10893. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10894. enable_if<std::is_same<Return, SSE::float_v>::value>)
  10895. {
  10896. return _mm_setr_ps(x0.data(), x1.data(), x2.data(), 0.f);
  10897. }
  10898. template <typename Return, typename T>
  10899. Vc_INTRINSIC Vc_CONST Return
  10900. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10901. enable_if<std::is_same<Return, SSE::int_v>::value>)
  10902. {
  10903. return _mm_setr_epi32(x0.data(), x1.data(), x2.data(), 0);
  10904. }
  10905. template <typename Return, typename T>
  10906. Vc_INTRINSIC Vc_CONST Return
  10907. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10908. enable_if<std::is_same<Return, SSE::uint_v>::value>)
  10909. {
return _mm_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), 0);
  10912. }
  10913. template <typename Return, typename T>
  10914. Vc_INTRINSIC Vc_CONST Return
  10915. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10916. enable_if<std::is_same<Return, SSE::short_v>::value>)
  10917. {
  10918. return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0);
  10919. }
  10920. template <typename Return, typename T>
  10921. Vc_INTRINSIC Vc_CONST Return
  10922. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10923. enable_if<std::is_same<Return, SSE::ushort_v>::value>)
  10924. {
  10925. return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0);
  10926. }
  10927. template <typename Return, typename T>
  10928. Vc_INTRINSIC Vc_CONST Return
  10929. simd_cast(Scalar::Vector<T> x0,
  10930. Scalar::Vector<T> x1,
  10931. Scalar::Vector<T> x2,
  10932. Scalar::Vector<T> x3,
  10933. enable_if<std::is_same<Return, SSE::float_v>::value> )
  10934. {
  10935. return _mm_setr_ps(
  10936. x0.data(), x1.data(), x2.data(), x3.data());
  10937. }
  10938. template <typename Return, typename T>
  10939. Vc_INTRINSIC Vc_CONST Return
  10940. simd_cast(Scalar::Vector<T> x0,
  10941. Scalar::Vector<T> x1,
  10942. Scalar::Vector<T> x2,
  10943. Scalar::Vector<T> x3,
  10944. enable_if<std::is_same<Return, SSE::int_v>::value> )
  10945. {
  10946. return _mm_setr_epi32(
  10947. x0.data(), x1.data(), x2.data(), x3.data());
  10948. }
  10949. template <typename Return, typename T>
  10950. Vc_INTRINSIC Vc_CONST Return
  10951. simd_cast(Scalar::Vector<T> x0,
  10952. Scalar::Vector<T> x1,
  10953. Scalar::Vector<T> x2,
  10954. Scalar::Vector<T> x3,
  10955. enable_if<std::is_same<Return, SSE::uint_v>::value> )
  10956. {
return _mm_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), uint(x3.data()));
  10959. }
  10960. template <typename Return, typename T>
  10961. Vc_INTRINSIC Vc_CONST Return
  10962. simd_cast(Scalar::Vector<T> x0,
  10963. Scalar::Vector<T> x1,
  10964. Scalar::Vector<T> x2,
  10965. Scalar::Vector<T> x3,
  10966. enable_if<std::is_same<Return, SSE::short_v>::value> )
  10967. {
  10968. return _mm_setr_epi16(
  10969. x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0);
  10970. }
  10971. template <typename Return, typename T>
  10972. Vc_INTRINSIC Vc_CONST Return
  10973. simd_cast(Scalar::Vector<T> x0,
  10974. Scalar::Vector<T> x1,
  10975. Scalar::Vector<T> x2,
  10976. Scalar::Vector<T> x3,
  10977. enable_if<std::is_same<Return, SSE::ushort_v>::value> )
  10978. {
  10979. return _mm_setr_epi16(
  10980. x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0);
  10981. }
  10982. template <typename Return, typename T>
  10983. Vc_INTRINSIC Vc_CONST Return
  10984. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10985. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  10986. enable_if<std::is_same<Return, SSE::short_v>::value>)
  10987. {
  10988. return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0);
  10989. }
  10990. template <typename Return, typename T>
  10991. Vc_INTRINSIC Vc_CONST Return
  10992. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  10993. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  10994. enable_if<std::is_same<Return, SSE::ushort_v>::value>)
  10995. {
  10996. return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0);
  10997. }
  10998. template <typename Return, typename T>
  10999. Vc_INTRINSIC Vc_CONST Return
  11000. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  11001. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  11002. enable_if<std::is_same<Return, SSE::short_v>::value>)
  11003. {
  11004. return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  11005. x5.data(), 0, 0);
  11006. }
  11007. template <typename Return, typename T>
  11008. Vc_INTRINSIC Vc_CONST Return
  11009. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  11010. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  11011. enable_if<std::is_same<Return, SSE::ushort_v>::value>)
  11012. {
  11013. return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  11014. x5.data(), 0, 0);
  11015. }
  11016. template <typename Return, typename T>
  11017. Vc_INTRINSIC Vc_CONST Return
  11018. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  11019. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  11020. Scalar::Vector<T> x6, enable_if<std::is_same<Return, SSE::short_v>::value>)
  11021. {
  11022. return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  11023. x5.data(), x6.data(), 0);
  11024. }
  11025. template <typename Return, typename T>
  11026. Vc_INTRINSIC Vc_CONST Return
  11027. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  11028. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  11029. Scalar::Vector<T> x6, enable_if<std::is_same<Return, SSE::ushort_v>::value>)
  11030. {
  11031. return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  11032. x5.data(), x6.data(), 0);
  11033. }
  11034. template <typename Return, typename T>
  11035. Vc_INTRINSIC Vc_CONST Return
  11036. simd_cast(Scalar::Vector<T> x0,
  11037. Scalar::Vector<T> x1,
  11038. Scalar::Vector<T> x2,
  11039. Scalar::Vector<T> x3,
  11040. Scalar::Vector<T> x4,
  11041. Scalar::Vector<T> x5,
  11042. Scalar::Vector<T> x6,
  11043. Scalar::Vector<T> x7,
  11044. enable_if<std::is_same<Return, SSE::short_v>::value> )
  11045. {
return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data());
  11054. }
  11055. template <typename Return, typename T>
  11056. Vc_INTRINSIC Vc_CONST Return
  11057. simd_cast(Scalar::Vector<T> x0,
  11058. Scalar::Vector<T> x1,
  11059. Scalar::Vector<T> x2,
  11060. Scalar::Vector<T> x3,
  11061. Scalar::Vector<T> x4,
  11062. Scalar::Vector<T> x5,
  11063. Scalar::Vector<T> x6,
  11064. Scalar::Vector<T> x7,
  11065. enable_if<std::is_same<Return, SSE::ushort_v>::value> )
  11066. {
return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data());
  11075. }
  11076. template <typename To, typename FromT>
  11077. Vc_INTRINSIC Vc_CONST To
  11078. simd_cast(SSE::Vector<FromT> x, enable_if<Scalar::is_vector<To>::value> )
  11079. {
  11080. return static_cast<To>(x[0]);
  11081. }
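// SSE::Mask -> SSE::Mask casts: a single mask goes through Detail::mask_cast;
// two or four source masks are combined into a mask with 2x/4x the entry count
// by narrowing each entry with saturating packs (_mm_packs_epi16); missing
// entries come out zero.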
  11082. template <typename Return, typename T>
  11083. Vc_INTRINSIC Vc_CONST Return
  11084. simd_cast(SSE::Mask<T> x, enable_if<SSE::is_mask<Return>::value> = nullarg)
  11085. {
  11086. using M = SSE::Mask<T>;
  11087. return {Detail::mask_cast<M::Size, Return::Size, __m128>(x.dataI())};
  11088. }
  11089. template <typename Return, typename T>
  11090. Vc_INTRINSIC Vc_CONST Return simd_cast(
  11091. SSE::Mask<T> x0,
  11092. SSE::Mask<T> x1,
  11093. enable_if<SSE::is_mask<Return>::value && Mask<T, VectorAbi::Sse>::Size * 2 == Return::Size> = nullarg)
  11094. {
  11095. return SSE::sse_cast<__m128>(_mm_packs_epi16(x0.dataI(), x1.dataI()));
  11096. }
  11097. template <typename Return, typename T>
  11098. Vc_INTRINSIC Vc_CONST Return simd_cast(
  11099. SSE::Mask<T> x0,
  11100. SSE::Mask<T> x1,
  11101. enable_if<SSE::is_mask<Return>::value && Mask<T, VectorAbi::Sse>::Size * 4 == Return::Size> = nullarg)
  11102. {
  11103. return SSE::sse_cast<__m128>(
  11104. _mm_packs_epi16(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_setzero_si128()));
  11105. }
  11106. template <typename Return, typename T>
  11107. Vc_INTRINSIC Vc_CONST Return simd_cast(
  11108. SSE::Mask<T> x0,
  11109. SSE::Mask<T> x1,
  11110. SSE::Mask<T> x2,
  11111. SSE::Mask<T> x3,
  11112. enable_if<SSE::is_mask<Return>::value && Mask<T, VectorAbi::Sse>::Size * 4 == Return::Size> = nullarg)
  11113. {
  11114. return SSE::sse_cast<__m128>(_mm_packs_epi16(_mm_packs_epi16(x0.dataI(), x1.dataI()),
  11115. _mm_packs_epi16(x2.dataI(), x3.dataI())));
  11116. }
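// Scalar::Mask -> SSE::Mask: set the destination lanes individually; the
// Return::Size checks skip lanes the destination mask does not have.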
  11117. template <typename Return, typename T>
  11118. Vc_INTRINSIC Vc_CONST Return
  11119. simd_cast(Scalar::Mask<T> x, enable_if<SSE::is_mask<Return>::value> = nullarg)
  11120. {
  11121. Return m(false);
  11122. m[0] = x[0];
  11123. return m;
  11124. }
  11125. template <typename Return, typename T>
  11126. Vc_INTRINSIC Vc_CONST Return
  11127. simd_cast(Scalar::Mask<T> x0, Scalar::Mask<T> x1, enable_if<SSE::is_mask<Return>::value> = nullarg)
  11128. {
  11129. Return m(false);
  11130. m[0] = x0[0];
  11131. m[1] = x1[0];
  11132. return m;
  11133. }
  11134. template <typename Return, typename T>
  11135. Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask<T> x0,
  11136. Scalar::Mask<T> x1,
  11137. Scalar::Mask<T> x2,
  11138. Scalar::Mask<T> x3,
  11139. enable_if<SSE::is_mask<Return>::value> = nullarg)
  11140. {
  11141. Return m(false);
  11142. m[0] = x0[0];
  11143. m[1] = x1[0];
  11144. if (Return::Size >= 4) {
  11145. m[2] = x2[0];
  11146. m[3] = x3[0];
  11147. }
  11148. return m;
  11149. }
  11150. template <typename Return, typename T>
  11151. Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask<T> x0,
  11152. Scalar::Mask<T> x1,
  11153. Scalar::Mask<T> x2,
  11154. Scalar::Mask<T> x3,
  11155. Scalar::Mask<T> x4,
  11156. Scalar::Mask<T> x5,
  11157. Scalar::Mask<T> x6,
  11158. Scalar::Mask<T> x7,
  11159. enable_if<SSE::is_mask<Return>::value> = nullarg)
  11160. {
  11161. Return m(false);
  11162. m[0] = x0[0];
  11163. m[1] = x1[0];
  11164. if (Return::Size >= 4) {
  11165. m[2] = x2[0];
  11166. m[3] = x3[0];
  11167. }
  11168. if (Return::Size >= 8) {
  11169. m[4] = x4[0];
  11170. m[5] = x5[0];
  11171. m[6] = x6[0];
  11172. m[7] = x7[0];
  11173. }
  11174. return m;
  11175. }
  11176. template <typename To, typename FromT>
  11177. Vc_INTRINSIC Vc_CONST To
  11178. simd_cast(SSE::Mask<FromT> x, enable_if<Scalar::is_mask<To>::value> = nullarg)
  11179. {
  11180. return static_cast<To>(x[0]);
  11181. }
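// Offset casts: simd_cast<Return, offset>(x) selects the offset-th chunk of x.
// offset == 0 forwards to the plain cast; offset != 0 first shifts the source
// register right by offset * sizeof(entry) * Return::Size bytes, so e.g.
// simd_cast<SSE::float_v, 1>(short_v) converts elements 4..7 of the short_v.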
  11182. template <typename Return, int offset, typename V>
  11183. Vc_INTRINSIC Vc_CONST Return
  11184. simd_cast(V &&x, enable_if<offset == 0 && ((SSE::is_vector<Traits::decay<V>>::value &&
  11185. SSE::is_vector<Return>::value) ||
  11186. (SSE::is_mask<Traits::decay<V>>::value &&
  11187. SSE::is_mask<Return>::value))> = nullarg)
  11188. {
  11189. return simd_cast<Return>(x);
  11190. }
  11191. template <typename Return, int offset, typename V>
  11192. Vc_INTRINSIC Vc_CONST Return
  11193. simd_cast(V &&x,
  11194. enable_if<offset == 0 && ((Scalar::is_vector<Traits::decay<V>>::value &&
  11195. SSE::is_vector<Return>::value) ||
  11196. (Scalar::is_mask<Traits::decay<V>>::value &&
  11197. SSE::is_mask<Return>::value))> = nullarg)
  11198. {
  11199. return simd_cast<Return>(x);
  11200. }
  11201. template <typename Return, int offset, typename V>
  11202. Vc_INTRINSIC Vc_CONST Return simd_cast(
  11203. V x,
  11204. enable_if<offset != 0 && (SSE::is_vector<Return>::value && SSE::is_vector<V>::value)> = nullarg)
  11205. {
  11206. constexpr int shift = (sizeof(V) / V::Size) * offset * Return::Size;
static_assert(shift > 0 && shift < 16, "invalid offset for this simd_cast overload");
  11208. return simd_cast<Return>(V{SSE::sse_cast<typename V::VectorType>(
  11209. _mm_srli_si128(SSE::sse_cast<__m128i>(x.data()), shift & 0xff))});
  11210. }
  11211. template <typename Return, int offset, typename T>
  11212. Vc_INTRINSIC Vc_CONST Return
  11213. simd_cast(SSE::Vector<T> x,
  11214. enable_if<offset != 0 && Scalar::is_vector<Return>::value> = nullarg)
  11215. {
  11216. return static_cast<typename Return::EntryType>(x[offset]);
  11217. }
  11218. template <typename Return, int offset, typename V>
  11219. Vc_INTRINSIC Vc_CONST Return simd_cast(
  11220. V x,
  11221. enable_if<offset != 0 && (SSE::is_mask<Return>::value && SSE::is_mask<V>::value)> = nullarg)
  11222. {
  11223. constexpr int shift = (sizeof(V) / V::Size) * offset * Return::Size;
static_assert(shift > 0 && shift < 16, "invalid offset for this simd_cast overload");
  11225. return simd_cast<Return>(V{SSE::sse_cast<typename V::VectorType>(
  11226. _mm_srli_si128(SSE::sse_cast<__m128i>(x.data()), shift & 0xff))});
  11227. }
  11228. #undef Vc_SIMD_CAST_1
  11229. #undef Vc_SIMD_CAST_2
  11230. #undef Vc_SIMD_CAST_4
  11231. #undef Vc_SIMD_CAST_8
  11232. }
  11233. #endif
  11234. #endif
  11235. #endif
  11236. #ifdef Vc_IMPL_AVX
  11237. #ifndef VC_AVX_VECTOR_H_
  11238. #define VC_AVX_VECTOR_H_
  11239. #ifndef VC_AVX_VECTORHELPER_H_
  11240. #define VC_AVX_VECTORHELPER_H_
  11241. #include <limits>
  11242. namespace Vc_VERSIONED_NAMESPACE
  11243. {
  11244. namespace AVX
  11245. {
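// VectorHelper<__m256/__m256d/__m256i>: the store overloads are selected via
// SFINAE on the memory-access Flags (aligned, unaligned, streaming, or both);
// the masked overloads choose maskstore vs. a streaming store via Flags::IsStreaming.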
  11246. template<> struct VectorHelper<__m256>
  11247. {
  11248. typedef __m256 VectorType;
  11249. typedef const VectorType VTArg;
  11250. template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_ps(mem, x); }
  11251. template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_ps(mem, x); }
  11252. template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_ps(mem, x); }
  11253. template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_ps()); }
  11254. template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
  11255. template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
  11256. };
  11257. template<> struct VectorHelper<__m256d>
  11258. {
  11259. typedef __m256d VectorType;
  11260. typedef const VectorType VTArg;
  11261. template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_pd(mem, x); }
  11262. template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_pd(mem, x); }
  11263. template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_pd(mem, x); }
  11264. template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_pd()); }
  11265. template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
  11266. template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
  11267. };
  11268. template<> struct VectorHelper<__m256i>
  11269. {
  11270. typedef __m256i VectorType;
  11271. typedef const VectorType VTArg;
  11272. template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x); }
  11273. template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); }
  11274. template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); }
  11275. template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_si256()); }
  11276. template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
  11277. template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
  11278. };
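// Helper macros: expand to thin one-/two-operand wrappers around the _mm256_*
// intrinsic selected by the current Vc_SUFFIX (pd for double, ps for float).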
  11279. #define Vc_OP1(op) \
  11280. static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return Vc_CAT2(_mm256_##op##_, Vc_SUFFIX)(a); }
  11281. #define Vc_OP(op) \
  11282. static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(op##_ , Vc_SUFFIX)(a, b); }
  11283. #define Vc_OP_(op) \
  11284. static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op , Vc_SUFFIX)(a, b); }
  11285. #define Vc_OPx(op,op2) \
  11286. static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op2##_, Vc_SUFFIX)(a, b); }
  11287. template<> struct VectorHelper<double> {
  11288. typedef __m256d VectorType;
  11289. typedef const VectorType VTArg;
  11290. typedef double EntryType;
  11291. #define Vc_SUFFIX pd
  11292. static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(_mm256_castps_pd(mask), a); }
  11293. static Vc_ALWAYS_INLINE VectorType set(const double a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
  11294. static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) {
  11295. return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d);
  11296. }
  11297. static Vc_ALWAYS_INLINE VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
  11298. static Vc_ALWAYS_INLINE VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }
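// fma emulation (no FMA4): mask off the low mantissa bits of v1 and v2 to get
// exactly representable high/low halves, form the partial products ll, lh, hh,
// and order the final additions by magnitude to limit the rounding error; the
// empty asm statement is only a workaround for GCC older than 4.7.3.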
  11299. static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
  11300. #ifdef Vc_IMPL_FMA4
  11301. v1 = _mm256_macc_pd(v1, v2, v3);
  11302. #else
  11303. VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
  11304. VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
  11305. #if defined(Vc_GCC) && Vc_GCC < 0x40703
  11306. asm("":"+x"(h1), "+x"(h2));
  11307. #endif
  11308. const VectorType l1 = _mm256_sub_pd(v1, h1);
  11309. const VectorType l2 = _mm256_sub_pd(v2, h2);
  11310. const VectorType ll = mul(l1, l2);
  11311. const VectorType lh = add(mul(l1, h2), mul(h1, l2));
  11312. const VectorType hh = mul(h1, h2);
  11313. const VectorType lh_lt_v3 = cmplt_pd(abs(lh), abs(v3));
  11314. const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3);
  11315. const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3);
  11316. v1 = add(add(ll, b), add(c, hh));
  11317. #endif
  11318. }
  11319. static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_pd(a,b); }
  11320. static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_pd(a,b); }
  11321. static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_pd(a,b); }
  11322. Vc_OP1(sqrt)
  11323. static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) {
  11324. return _mm256_div_pd(one(), sqrt(x));
  11325. }
  11326. static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
  11327. return _mm256_div_pd(one(), x);
  11328. }
  11329. static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
  11330. return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_pd());
  11331. }
  11332. static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_pd(a, b); }
  11333. static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_pd(a, b); }
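// Horizontal reductions: fold the upper 128-bit half onto the lower one first,
// then reduce within the remaining 128-bit register (VectorHelper<float> below
// follows the same pattern).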
  11334. static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
  11335. __m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
  11336. b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
  11337. return _mm_cvtsd_f64(b);
  11338. }
  11339. static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
  11340. __m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
  11341. b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
  11342. return _mm_cvtsd_f64(b);
  11343. }
  11344. static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
  11345. __m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
  11346. b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
  11347. return _mm_cvtsd_f64(b);
  11348. }
  11349. static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
  11350. __m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
  11351. b = _mm_hadd_pd(b, b);
  11352. return _mm_cvtsd_f64(b);
  11353. }
  11354. #undef Vc_SUFFIX
  11355. static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
  11356. return _mm256_round_pd(a, _MM_FROUND_NINT);
  11357. }
  11358. };
  11359. template<> struct VectorHelper<float> {
  11360. typedef float EntryType;
  11361. typedef __m256 VectorType;
  11362. typedef const VectorType VTArg;
  11363. #define Vc_SUFFIX ps
  11364. static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(mask, a); }
  11365. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
  11366. static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
  11367. const float e, const float f, const float g, const float h) {
  11368. return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h); }
  11369. static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
  11370. static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }
  11371. static Vc_ALWAYS_INLINE Vc_CONST __m256 concat(__m256d a, __m256d b) { return _mm256_insertf128_ps(avx_cast<__m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }
  11372. static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
  11373. #ifdef Vc_IMPL_FMA4
  11374. v1 = _mm256_macc_ps(v1, v2, v3);
  11375. #else
  11376. __m256d v1_0 = _mm256_cvtps_pd(lo128(v1));
  11377. __m256d v1_1 = _mm256_cvtps_pd(hi128(v1));
  11378. __m256d v2_0 = _mm256_cvtps_pd(lo128(v2));
  11379. __m256d v2_1 = _mm256_cvtps_pd(hi128(v2));
  11380. __m256d v3_0 = _mm256_cvtps_pd(lo128(v3));
  11381. __m256d v3_1 = _mm256_cvtps_pd(hi128(v3));
  11382. v1 = AVX::concat(
  11383. _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
  11384. _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
  11385. #endif
  11386. }
  11387. static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_ps(a, b); }
  11388. static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_ps(a, b); }
  11389. static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_ps(a, b); }
  11390. Vc_OP1(sqrt) Vc_OP1(rsqrt)
  11391. static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
  11392. return _mm256_rcp_ps(x);
  11393. }
  11394. static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
  11395. return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_ps());
  11396. }
  11397. static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_ps(a, b); }
  11398. static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_ps(a, b); }
  11399. static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
  11400. __m128 b = _mm_min_ps(lo128(a), hi128(a));
  11401. b = _mm_min_ps(b, _mm_movehl_ps(b, b));
  11402. b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1)));
  11403. return _mm_cvtss_f32(b);
  11404. }
  11405. static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
  11406. __m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
  11407. b = _mm_max_ps(b, _mm_movehl_ps(b, b));
  11408. b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1)));
  11409. return _mm_cvtss_f32(b);
  11410. }
  11411. static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
  11412. __m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
  11413. b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
  11414. b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
  11415. return _mm_cvtss_f32(b);
  11416. }
  11417. static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
  11418. __m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
  11419. b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
  11420. b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
  11421. return _mm_cvtss_f32(b);
  11422. }
  11423. #undef Vc_SUFFIX
  11424. static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
  11425. return _mm256_round_ps(a, _MM_FROUND_NINT);
  11426. }
  11427. };
  11428. #undef Vc_OP1
  11429. #undef Vc_OP
  11430. #undef Vc_OP_
  11431. #undef Vc_OPx
  11432. }
  11433. }
  11434. #endif
  11435. #ifndef VC_AVX_MASK_H_
  11436. #define VC_AVX_MASK_H_
  11437. #include <array>
  11438. #ifndef VC_AVX_DETAIL_H_
  11439. #define VC_AVX_DETAIL_H_
  11440. namespace Vc_VERSIONED_NAMESPACE
  11441. {
  11442. namespace Detail
  11443. {
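// AVX load helpers: the generic load() overloads dispatch on the Flags type
// (aligned / unaligned / streaming) and on LoadTag<VectorType, EntryType>;
// load32() loads a full 32-byte register for a given alignment tag and is
// reused by the converting loads further down.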
  11444. template <typename Flags>
  11445. Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
  11446. typename Flags::EnableIfAligned = nullptr)
  11447. {
  11448. return _mm256_load_ps(x);
  11449. }
  11450. template <typename Flags>
  11451. Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
  11452. typename Flags::EnableIfUnaligned = nullptr)
  11453. {
  11454. return _mm256_loadu_ps(x);
  11455. }
  11456. template <typename Flags>
  11457. Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
  11458. typename Flags::EnableIfStreaming = nullptr)
  11459. {
  11460. return AvxIntrinsics::stream_load<__m256>(x);
  11461. }
  11462. template <typename Flags>
  11463. Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
  11464. typename Flags::EnableIfAligned = nullptr)
  11465. {
  11466. return _mm256_load_pd(x);
  11467. }
  11468. template <typename Flags>
  11469. Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
  11470. typename Flags::EnableIfUnaligned = nullptr)
  11471. {
  11472. return _mm256_loadu_pd(x);
  11473. }
  11474. template <typename Flags>
  11475. Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
  11476. typename Flags::EnableIfStreaming = nullptr)
  11477. {
  11478. return AvxIntrinsics::stream_load<__m256d>(x);
  11479. }
  11480. template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
  11481. Vc_INTRINSIC Vc_PURE __m256i
  11482. load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfAligned = nullptr)
  11483. {
  11484. return _mm256_load_si256(reinterpret_cast<const __m256i *>(x));
  11485. }
  11486. template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
  11487. Vc_INTRINSIC Vc_PURE __m256i
  11488. load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfUnaligned = nullptr)
  11489. {
  11490. return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(x));
  11491. }
  11492. template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
  11493. Vc_INTRINSIC Vc_PURE __m256i
  11494. load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfStreaming = nullptr)
  11495. {
  11496. return AvxIntrinsics::stream_load<__m256i>(x);
  11497. }
  11498. Vc_INTRINSIC __m256 load32(const float *mem, when_aligned)
  11499. {
  11500. return _mm256_load_ps(mem);
  11501. }
  11502. Vc_INTRINSIC __m256 load32(const float *mem, when_unaligned)
  11503. {
  11504. return _mm256_loadu_ps(mem);
  11505. }
  11506. Vc_INTRINSIC __m256 load32(const float *mem, when_streaming)
  11507. {
  11508. return AvxIntrinsics::stream_load<__m256>(mem);
  11509. }
  11510. Vc_INTRINSIC __m256d load32(const double *mem, when_aligned)
  11511. {
  11512. return _mm256_load_pd(mem);
  11513. }
  11514. Vc_INTRINSIC __m256d load32(const double *mem, when_unaligned)
  11515. {
  11516. return _mm256_loadu_pd(mem);
  11517. }
  11518. Vc_INTRINSIC __m256d load32(const double *mem, when_streaming)
  11519. {
  11520. return AvxIntrinsics::stream_load<__m256d>(mem);
  11521. }
  11522. template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_aligned)
  11523. {
  11524. static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
  11525. return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
  11526. }
  11527. template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_unaligned)
  11528. {
  11529. static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
  11530. return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
  11531. }
  11532. template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_streaming)
  11533. {
  11534. static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
  11535. return AvxIntrinsics::stream_load<__m256i>(mem);
  11536. }
  11537. #ifdef Vc_MSVC
  11538. Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, LoadTag<__m256i, int>)
  11539. {
  11540. return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
  11541. }
  11542. Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, LoadTag<__m256d, double>)
  11543. {
  11544. return _mm256_loadu_pd(mem);
  11545. }
  11546. template <typename V, typename DstT>
  11547. Vc_INTRINSIC __m256 load(const float *mem, when_aligned,
  11548. enable_if<(std::is_same<DstT, float>::value &&
  11549. std::is_same<V, __m256>::value)> = nullarg)
  11550. {
  11551. return _mm256_load_ps(mem);
  11552. }
  11553. template <typename V, typename DstT>
  11554. Vc_INTRINSIC __m256 load(const float *mem, when_unaligned,
  11555. enable_if<(std::is_same<DstT, float>::value &&
  11556. std::is_same<V, __m256>::value)> = nullarg)
  11557. {
  11558. return _mm256_loadu_ps(mem);
  11559. }
  11560. template <typename V, typename DstT>
  11561. Vc_INTRINSIC __m256 load(const float *mem, when_streaming,
  11562. enable_if<(std::is_same<DstT, float>::value &&
  11563. std::is_same<V, __m256>::value)> = nullarg)
  11564. {
  11565. return AvxIntrinsics::stream_load<__m256>(mem);
  11566. }
  11567. template <typename V, typename DstT>
  11568. Vc_INTRINSIC __m256d load(const double *mem, when_aligned,
  11569. enable_if<(std::is_same<DstT, double>::value &&
  11570. std::is_same<V, __m256d>::value)> = nullarg)
  11571. {
  11572. return _mm256_load_pd(mem);
  11573. }
  11574. template <typename V, typename DstT>
  11575. Vc_INTRINSIC __m256d load(const double *mem, when_unaligned,
  11576. enable_if<(std::is_same<DstT, double>::value &&
  11577. std::is_same<V, __m256d>::value)> = nullarg)
  11578. {
  11579. return _mm256_loadu_pd(mem);
  11580. }
  11581. template <typename V, typename DstT>
  11582. Vc_INTRINSIC __m256d load(const double *mem, when_streaming,
  11583. enable_if<(std::is_same<DstT, double>::value &&
  11584. std::is_same<V, __m256d>::value)> = nullarg)
  11585. {
  11586. return AvxIntrinsics::stream_load<__m256d>(mem);
  11587. }
  11588. template <typename V, typename DstT>
  11589. Vc_INTRINSIC __m256i load(const uint *mem, when_aligned,
  11590. enable_if<(std::is_same<DstT, uint>::value &&
  11591. std::is_same<V, __m256i>::value)> = nullarg)
  11592. {
  11593. return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
  11594. }
  11595. template <typename V, typename DstT>
  11596. Vc_INTRINSIC __m256i load(const uint *mem, when_unaligned,
  11597. enable_if<(std::is_same<DstT, uint>::value &&
  11598. std::is_same<V, __m256i>::value)> = nullarg)
  11599. {
  11600. return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
  11601. }
  11602. template <typename V, typename DstT>
  11603. Vc_INTRINSIC __m256i load(const uint *mem, when_streaming,
  11604. enable_if<(std::is_same<DstT, uint>::value &&
  11605. std::is_same<V, __m256i>::value)> = nullarg)
  11606. {
  11607. return AvxIntrinsics::stream_load<__m256i>(mem);
  11608. }
  11609. template <typename V, typename DstT>
  11610. Vc_INTRINSIC __m256i load(const int *mem, when_unaligned,
  11611. enable_if<(std::is_same<DstT, int>::value &&
  11612. std::is_same<V, __m256i>::value)> = nullarg)
  11613. {
  11614. return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
  11615. }
  11616. template <typename V, typename DstT>
  11617. Vc_INTRINSIC __m256i load(const int *mem, when_aligned,
  11618. enable_if<(std::is_same<DstT, int>::value &&
  11619. std::is_same<V, __m256i>::value)> = nullarg)
  11620. {
  11621. return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
  11622. }
  11623. template <typename V, typename DstT>
  11624. Vc_INTRINSIC __m256i load(const int *mem, when_streaming,
  11625. enable_if<(std::is_same<DstT, int>::value &&
  11626. std::is_same<V, __m256i>::value)> = nullarg)
  11627. {
  11628. return AvxIntrinsics::stream_load<__m256i>(mem);
  11629. }
  11630. template <typename V, typename DstT>
  11631. Vc_INTRINSIC __m256i load(const short *mem, when_unaligned,
  11632. enable_if<(std::is_same<DstT, short>::value &&
  11633. std::is_same<V, __m256i>::value)> = nullarg)
  11634. {
  11635. return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
  11636. }
  11637. template <typename V, typename DstT>
  11638. Vc_INTRINSIC __m256i load(const short *mem, when_aligned,
  11639. enable_if<(std::is_same<DstT, short>::value &&
  11640. std::is_same<V, __m256i>::value)> = nullarg)
  11641. {
  11642. return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
  11643. }
  11644. template <typename V, typename DstT>
  11645. Vc_INTRINSIC __m256i load(const short *mem, when_streaming,
  11646. enable_if<(std::is_same<DstT, short>::value &&
  11647. std::is_same<V, __m256i>::value)> = nullarg)
  11648. {
  11649. return AvxIntrinsics::stream_load<__m256i>(mem);
  11650. }
  11651. template <typename V, typename DstT>
  11652. Vc_INTRINSIC __m256i load(const ushort *mem, when_unaligned,
  11653. enable_if<(std::is_same<DstT, ushort>::value &&
  11654. std::is_same<V, __m256i>::value)> = nullarg)
  11655. {
  11656. return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
  11657. }
  11658. template <typename V, typename DstT>
  11659. Vc_INTRINSIC __m256i load(const ushort *mem, when_aligned,
  11660. enable_if<(std::is_same<DstT, ushort>::value &&
  11661. std::is_same<V, __m256i>::value)> = nullarg)
  11662. {
  11663. return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
  11664. }
  11665. template <typename V, typename DstT>
  11666. Vc_INTRINSIC __m256i load(const ushort *mem, when_streaming,
  11667. enable_if<(std::is_same<DstT, ushort>::value &&
  11668. std::is_same<V, __m256i>::value)> = nullarg)
  11669. {
  11670. return AvxIntrinsics::stream_load<__m256i>(mem);
  11671. }
  11672. #endif
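// Converting loads: the element type in memory differs from the vector's entry
// type. Same-width reinterpretations forward to load32(); narrower integers are
// loaded as 128 bits (load16) and widened with cvtep*; the remaining cases go
// through AVX::convert.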
  11673. template <typename Flags>
  11674. Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, short>)
  11675. {
  11676. return load32(mem, f);
  11677. }
  11678. template <typename Flags>
  11679. Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, short>)
  11680. {
  11681. return AVX::cvtepu8_epi16(load16(mem, f));
  11682. }
  11683. template <typename Flags>
  11684. Vc_INTRINSIC __m256i load(const schar *mem, Flags f, LoadTag<__m256i, short>)
  11685. {
  11686. return AVX::cvtepi8_epi16(load16(mem, f));
  11687. }
  11688. template <typename Flags>
  11689. Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, ushort>)
  11690. {
  11691. return AVX::cvtepu8_epi16(load16(mem, f));
  11692. }
  11693. template <typename Flags>
  11694. Vc_INTRINSIC __m256i load(const uint *mem, Flags f, LoadTag<__m256i, int>)
  11695. {
  11696. return load32(mem, f);
  11697. }
  11698. template <typename Flags>
  11699. Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, int>)
  11700. {
  11701. return AVX::cvtepu16_epi32(load16(mem, f));
  11702. }
  11703. template <typename Flags>
  11704. Vc_INTRINSIC __m256i load(const short *mem, Flags f, LoadTag<__m256i, int>)
  11705. {
  11706. return AVX::cvtepi16_epi32(load16(mem, f));
  11707. }
  11708. template <typename Flags>
  11709. Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, int>)
  11710. {
  11711. return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
  11712. }
  11713. template <typename Flags>
  11714. Vc_INTRINSIC __m256i load(const schar *mem, Flags, LoadTag<__m256i, int>)
  11715. {
  11716. return AVX::cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
  11717. }
  11718. template <typename Flags>
  11719. Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, uint>)
  11720. {
  11721. return AVX::cvtepu16_epi32(load16(mem, f));
  11722. }
  11723. template <typename Flags>
  11724. Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, uint>)
  11725. {
  11726. return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
  11727. }
  11728. template <typename Flags>
  11729. Vc_INTRINSIC __m256d load(const float *mem, Flags f, LoadTag<__m256d, double>)
  11730. {
  11731. return AVX::convert<float, double>(load16(mem, f));
  11732. }
  11733. template <typename Flags>
  11734. Vc_INTRINSIC __m256d load(const uint *mem, Flags f, LoadTag<__m256d, double>)
  11735. {
  11736. return AVX::convert<uint, double>(load16(mem, f));
  11737. }
  11738. template <typename Flags>
  11739. Vc_INTRINSIC __m256d load(const int *mem, Flags f, LoadTag<__m256d, double>)
  11740. {
  11741. return AVX::convert<int, double>(load16(mem, f));
  11742. }
  11743. template <typename Flags>
  11744. Vc_INTRINSIC __m256d load(const ushort *mem, Flags f, LoadTag<__m256d, double>)
  11745. {
  11746. return AVX::convert<int, double>(load16(mem, f));
  11747. }
  11748. template <typename Flags>
  11749. Vc_INTRINSIC __m256d load(const short *mem, Flags f, LoadTag<__m256d, double>)
  11750. {
  11751. return AVX::convert<int, double>(load16(mem, f));
  11752. }
  11753. template <typename Flags>
  11754. Vc_INTRINSIC __m256d load(const uchar *mem, Flags f, LoadTag<__m256d, double>)
  11755. {
  11756. return AVX::convert<int, double>(load16(mem, f));
  11757. }
  11758. template <typename Flags>
  11759. Vc_INTRINSIC __m256d load(const schar *mem, Flags f, LoadTag<__m256d, double>)
  11760. {
  11761. return AVX::convert<int, double>(load16(mem, f));
  11762. }
  11763. template <typename Flags>
  11764. Vc_INTRINSIC __m256 load(const double *mem, Flags f, LoadTag<__m256, float>)
  11765. {
  11766. return AVX::concat(_mm256_cvtpd_ps(load32(&mem[0], f)),
  11767. _mm256_cvtpd_ps(load32(&mem[4], f)));
  11768. }
  11769. template <typename Flags>
  11770. Vc_INTRINSIC __m256 load(const uint *mem, Flags f, LoadTag<__m256, float>)
  11771. {
  11772. const auto v = load32(mem, f);
  11773. return _mm256_blendv_ps(
  11774. _mm256_cvtepi32_ps(v),
  11775. _mm256_add_ps(_mm256_cvtepi32_ps(AVX::sub_epi32(v, AVX::set2power31_epu32())),
  11776. AVX::set2power31_ps()),
  11777. _mm256_castsi256_ps(AVX::cmplt_epi32(v, _mm256_setzero_si256())));
  11778. }
  11779. template <typename Flags>
  11780. Vc_INTRINSIC __m256 load(const int *mem, Flags f, LoadTag<__m256, float>)
  11781. {
  11782. return AVX::convert<int, float>(load32(mem, f));
  11783. }
  11784. template <typename T, typename Flags,
  11785. typename = enable_if<!std::is_same<T, float>::value>>
  11786. Vc_INTRINSIC __m256 load(const T *mem, Flags f, LoadTag<__m256, float>)
  11787. {
  11788. return _mm256_cvtepi32_ps(load<__m256i, int>(mem, f));
  11789. }
  11790. template <typename Flags>
  11791. Vc_INTRINSIC __m256 load(const ushort *mem, Flags f, LoadTag<__m256, float>)
  11792. {
  11793. return AVX::convert<ushort, float>(load16(mem, f));
  11794. }
  11795. template <typename Flags>
  11796. Vc_INTRINSIC __m256 load(const short *mem, Flags f, LoadTag<__m256, float>)
  11797. {
  11798. return AVX::convert<short, float>(load16(mem, f));
  11799. }
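// shifted<amount>(k): shift a 32-byte register right by `amount` bytes (left for
// negative amounts), filling with zeros; the four overloads handle shifts within
// and across the 128-bit lane boundary.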
  11800. template <int amount, typename T>
  11801. Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount >= 16), T> shifted(T k)
  11802. {
  11803. return AVX::avx_cast<T>(AVX::zeroExtend(
  11804. _mm_srli_si128(AVX::hi128(AVX::avx_cast<__m256i>(k)), amount - 16)));
  11805. }
  11806. template <int amount, typename T>
  11807. Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > 0 && amount < 16), T>
  11808. shifted(T k)
  11809. {
  11810. return AVX::avx_cast<T>(
  11811. AVX::alignr<amount>(Mem::permute128<X1, Const0>(AVX::avx_cast<__m256i>(k)),
  11812. AVX::avx_cast<__m256i>(k)));
  11813. }
  11814. template <int amount, typename T>
  11815. Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount <= -16), T> shifted(T k)
  11816. {
  11817. return AVX::avx_cast<T>(Mem::permute128<Const0, X0>(AVX::avx_cast<__m256i>(
  11818. _mm_slli_si128(AVX::lo128(AVX::avx_cast<__m256i>(k)), -16 - amount))));
  11819. }
  11820. template <int amount, typename T>
  11821. Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > -16 && amount < 0), T>
  11822. shifted(T k)
  11823. {
  11824. return AVX::avx_cast<T>(
  11825. AVX::alignr<16 + amount>(AVX::avx_cast<__m256i>(k),
  11826. Mem::permute128<Const0, X0>(AVX::avx_cast<__m256i>(k))));
  11827. }
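// mask_cast<From, To, R>: reinterpret a mask with From entries as one with To
// entries in register type R, packing entries to narrow them or unpacking to
// widen them; entries beyond the source count come out false.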
  11828. template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m256i k)
  11829. {
  11830. static_assert(From == To, "Incorrect mask cast.");
  11831. static_assert(std::is_same<R, __m256>::value, "Incorrect mask cast.");
  11832. return AVX::avx_cast<__m256>(k);
  11833. }
  11834. template <> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 4, __m128>(__m256i k)
  11835. {
  11836. return AVX::avx_cast<__m128>(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)));
  11837. }
  11838. template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 4, __m256>(__m128i k)
  11839. {
  11840. const auto kk = _mm_castsi128_ps(k);
  11841. return AVX::concat(_mm_unpacklo_ps(kk, kk), _mm_unpackhi_ps(kk, kk));
  11842. }
  11843. template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m256i k)
  11844. {
  11845. return AVX::avx_cast<__m256>(AVX::concat(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)),
  11846. _mm_setzero_si128()));
  11847. }
  11848. template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m256i k)
  11849. {
  11850. return AVX::avx_cast<__m128>(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)), _mm_setzero_si128()));
  11851. }
  11852. template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m128i k)
  11853. {
  11854. return AVX::zeroExtend(AVX::avx_cast<__m128>(k));
  11855. }
  11856. template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 16, __m256>(__m256i k)
  11857. {
  11858. return AVX::zeroExtend(mask_cast<4, 8, __m128>(k));
  11859. }
  11860. template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m256i k)
  11861. {
  11862. const auto lo = AVX::lo128(AVX::avx_cast<__m256>(k));
  11863. return AVX::concat(_mm_unpacklo_ps(lo, lo),
  11864. _mm_unpackhi_ps(lo, lo));
  11865. }
  11866. template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m256i k)
  11867. {
  11868. return AVX::avx_cast<__m128>(AVX::lo128(k));
  11869. }
  11870. template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m128i k)
  11871. {
  11872. const auto tmp = _mm_unpacklo_epi16(k, k);
  11873. return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(tmp, tmp),
  11874. _mm_unpackhi_epi32(tmp, tmp)));
  11875. }
  11876. template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 8, __m128>(__m256i k)
  11877. {
  11878. return AVX::avx_cast<__m128>(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
  11879. }
  11880. template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 8, __m256>(__m128i k)
  11881. {
  11882. return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(k, k),
  11883. _mm_unpackhi_epi16(k, k)));
  11884. }
  11885. template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 16, __m256>(__m256i k)
  11886. {
  11887. return AVX::zeroExtend(mask_cast<8, 8, __m128>(k));
  11888. }
  11889. #ifdef Vc_IMPL_AVX2
  11890. template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 8, __m256>(__m256i k)
  11891. {
  11892. const auto flipped = Mem::permute4x64<X0, X2, X1, X3>(k);
  11893. return _mm256_castsi256_ps(AVX::unpacklo_epi16(flipped, flipped));
  11894. }
  11895. #endif
  11896. template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 4, __m256>(__m256i k)
  11897. {
  11898. const auto tmp = _mm_unpacklo_epi16(AVX::lo128(k), AVX::lo128(k));
  11899. return _mm256_castsi256_ps(AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)));
  11900. }
  11901. template<> Vc_INTRINSIC Vc_CONST __m256 allone<__m256 >() { return AVX::setallone_ps(); }
  11902. template<> Vc_INTRINSIC Vc_CONST __m256i allone<__m256i>() { return AVX::setallone_si256(); }
  11903. template<> Vc_INTRINSIC Vc_CONST __m256d allone<__m256d>() { return AVX::setallone_pd(); }
  11904. template<> Vc_INTRINSIC Vc_CONST __m256 zero<__m256 >() { return _mm256_setzero_ps(); }
  11905. template<> Vc_INTRINSIC Vc_CONST __m256i zero<__m256i>() { return _mm256_setzero_si256(); }
  11906. template<> Vc_INTRINSIC Vc_CONST __m256d zero<__m256d>() { return _mm256_setzero_pd(); }
  11907. Vc_INTRINSIC Vc_CONST __m256 one( float) { return AVX::setone_ps (); }
  11908. Vc_INTRINSIC Vc_CONST __m256d one(double) { return AVX::setone_pd (); }
  11909. Vc_INTRINSIC Vc_CONST __m256i one( int) { return AVX::setone_epi32(); }
  11910. Vc_INTRINSIC Vc_CONST __m256i one( uint) { return AVX::setone_epu32(); }
  11911. Vc_INTRINSIC Vc_CONST __m256i one( short) { return AVX::setone_epi16(); }
  11912. Vc_INTRINSIC Vc_CONST __m256i one(ushort) { return AVX::setone_epu16(); }
  11913. Vc_INTRINSIC Vc_CONST __m256i one( schar) { return AVX::setone_epi8 (); }
  11914. Vc_INTRINSIC Vc_CONST __m256i one( uchar) { return AVX::setone_epu8 (); }
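// The arithmetic and comparison helpers below select their intrinsic via a tag
// argument (the entry type, or its size for negate); without AVX2 the 256-bit
// integer bit operations fall back to the float-typed _mm256_*_ps intrinsics.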
  11915. Vc_ALWAYS_INLINE Vc_CONST __m256 negate(__m256 v, std::integral_constant<std::size_t, 4>)
  11916. {
  11917. return _mm256_xor_ps(v, AVX::setsignmask_ps());
  11918. }
  11919. Vc_ALWAYS_INLINE Vc_CONST __m256d negate(__m256d v, std::integral_constant<std::size_t, 8>)
  11920. {
  11921. return _mm256_xor_pd(v, AVX::setsignmask_pd());
  11922. }
  11923. Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 4>)
  11924. {
  11925. return AVX::sign_epi32(v, Detail::allone<__m256i>());
  11926. }
  11927. Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 2>)
  11928. {
  11929. return AVX::sign_epi16(v, Detail::allone<__m256i>());
  11930. }
  11931. Vc_INTRINSIC __m256 xor_(__m256 a, __m256 b) { return _mm256_xor_ps(a, b); }
  11932. Vc_INTRINSIC __m256d xor_(__m256d a, __m256d b) { return _mm256_xor_pd(a, b); }
  11933. Vc_INTRINSIC __m256i xor_(__m256i a, __m256i b)
  11934. {
  11935. #ifdef Vc_IMPL_AVX2
  11936. return _mm256_xor_si256(a, b);
  11937. #else
  11938. return _mm256_castps_si256(
  11939. _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
  11940. #endif
  11941. }
  11942. Vc_INTRINSIC __m256 or_(__m256 a, __m256 b) { return _mm256_or_ps(a, b); }
  11943. Vc_INTRINSIC __m256d or_(__m256d a, __m256d b) { return _mm256_or_pd(a, b); }
  11944. Vc_INTRINSIC __m256i or_(__m256i a, __m256i b)
  11945. {
  11946. #ifdef Vc_IMPL_AVX2
  11947. return _mm256_or_si256(a, b);
  11948. #else
  11949. return _mm256_castps_si256(
  11950. _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
  11951. #endif
  11952. }
  11953. Vc_INTRINSIC __m256 and_(__m256 a, __m256 b) { return _mm256_and_ps(a, b); }
  11954. Vc_INTRINSIC __m256d and_(__m256d a, __m256d b) { return _mm256_and_pd(a, b); }
  11955. Vc_INTRINSIC __m256i and_(__m256i a, __m256i b) {
  11956. #ifdef Vc_IMPL_AVX2
  11957. return _mm256_and_si256(a, b);
  11958. #else
  11959. return _mm256_castps_si256(
  11960. _mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
  11961. #endif
  11962. }
  11963. Vc_INTRINSIC __m256 andnot_(__m256 a, __m256 b) { return _mm256_andnot_ps(a, b); }
  11964. Vc_INTRINSIC __m256d andnot_(__m256d a, __m256d b) { return _mm256_andnot_pd(a, b); }
  11965. Vc_INTRINSIC __m256i andnot_(__m256i a, __m256i b)
  11966. {
  11967. #ifdef Vc_IMPL_AVX2
  11968. return _mm256_andnot_si256(a, b);
  11969. #else
  11970. return _mm256_castps_si256(
  11971. _mm256_andnot_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
  11972. #endif
  11973. }
  11974. Vc_INTRINSIC __m256 not_(__m256 a) { return andnot_(a, allone<__m256 >()); }
  11975. Vc_INTRINSIC __m256d not_(__m256d a) { return andnot_(a, allone<__m256d>()); }
  11976. Vc_INTRINSIC __m256i not_(__m256i a) { return andnot_(a, allone<__m256i>()); }
  11977. Vc_INTRINSIC __m256 blend(__m256 a, __m256 b, __m256 c) { return _mm256_blendv_ps(a, b, c); }
  11978. Vc_INTRINSIC __m256d blend(__m256d a, __m256d b, __m256d c) { return _mm256_blendv_pd(a, b, c); }
  11979. Vc_INTRINSIC __m256i blend(__m256i a, __m256i b, __m256i c) { return AVX::blendv_epi8(a, b, c); }
  11980. Vc_INTRINSIC __m256 abs(__m256 a, float) { return and_(a, AVX::setabsmask_ps()); }
  11981. Vc_INTRINSIC __m256d abs(__m256d a, double) { return and_(a, AVX::setabsmask_pd()); }
  11982. Vc_INTRINSIC __m256i abs(__m256i a, int) { return AVX::abs_epi32(a); }
  11983. Vc_INTRINSIC __m256i abs(__m256i a, uint) { return a; }
  11984. Vc_INTRINSIC __m256i abs(__m256i a, short) { return AVX::abs_epi16(a); }
  11985. Vc_INTRINSIC __m256i abs(__m256i a, ushort) { return a; }
  11986. Vc_INTRINSIC __m256i abs(__m256i a, schar) { return AVX::abs_epi8 (a); }
  11987. Vc_INTRINSIC __m256i abs(__m256i a, uchar) { return a; }
  11988. Vc_INTRINSIC __m256 add(__m256 a, __m256 b, float) { return _mm256_add_ps(a, b); }
  11989. Vc_INTRINSIC __m256d add(__m256d a, __m256d b, double) { return _mm256_add_pd(a, b); }
  11990. Vc_INTRINSIC __m256i add(__m256i a, __m256i b, int) { return AVX::add_epi32(a, b); }
  11991. Vc_INTRINSIC __m256i add(__m256i a, __m256i b, uint) { return AVX::add_epi32(a, b); }
  11992. Vc_INTRINSIC __m256i add(__m256i a, __m256i b, short) { return AVX::add_epi16(a, b); }
  11993. Vc_INTRINSIC __m256i add(__m256i a, __m256i b, ushort) { return AVX::add_epi16(a, b); }
  11994. Vc_INTRINSIC __m256 sub(__m256 a, __m256 b, float) { return _mm256_sub_ps(a, b); }
  11995. Vc_INTRINSIC __m256d sub(__m256d a, __m256d b, double) { return _mm256_sub_pd(a, b); }
  11996. Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, int) { return AVX::sub_epi32(a, b); }
  11997. Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, uint) { return AVX::sub_epi32(a, b); }
  11998. Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, short) { return AVX::sub_epi16(a, b); }
  11999. Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, ushort) { return AVX::sub_epi16(a, b); }
  12000. Vc_INTRINSIC __m256 mul(__m256 a, __m256 b, float) { return _mm256_mul_ps(a, b); }
  12001. Vc_INTRINSIC __m256d mul(__m256d a, __m256d b, double) { return _mm256_mul_pd(a, b); }
  12002. Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, int) { return AVX::mullo_epi32(a, b); }
  12003. Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, uint) { return AVX::mullo_epi32(a, b); }
  12004. Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, short) { return AVX::mullo_epi16(a, b); }
  12005. Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, ushort) { return AVX::mullo_epi16(a, b); }
  12006. Vc_INTRINSIC __m256 div(__m256 a, __m256 b, float) { return _mm256_div_ps(a, b); }
  12007. Vc_INTRINSIC __m256d div(__m256d a, __m256d b, double) { return _mm256_div_pd(a, b); }
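// Integer division (no AVX instruction): convert to double (int/uint) or float
// (short), divide, and truncate back. For uint the inputs are biased by 2^31
// before the signed int->double conversion and the bias is re-added in double;
// lanes dividing by 1 return the dividend unchanged, since only those quotients
// could overflow the truncating conversion back to signed 32-bit.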
  12008. Vc_INTRINSIC __m256i div(__m256i a, __m256i b, int) {
  12009. using namespace AVX;
  12010. const __m256d lo1 = _mm256_cvtepi32_pd(lo128(a));
  12011. const __m256d lo2 = _mm256_cvtepi32_pd(lo128(b));
  12012. const __m256d hi1 = _mm256_cvtepi32_pd(hi128(a));
  12013. const __m256d hi2 = _mm256_cvtepi32_pd(hi128(b));
  12014. return concat(_mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)),
  12015. _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2)));
  12016. }
Vc_INTRINSIC __m256i div(__m256i a, __m256i b, uint) {
    using namespace AVX;
    const __m256i aa = add_epi32(a, set1_epi32(-2147483648));
    const __m256i bb = add_epi32(b, set1_epi32(-2147483648));
    const __m256d loa = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(aa)), set1_pd(2147483648.));
    const __m256d hia = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(aa)), set1_pd(2147483648.));
    const __m256d lob = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(bb)), set1_pd(2147483648.));
    const __m256d hib = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(bb)), set1_pd(2147483648.));
    return avx_cast<__m256i>(_mm256_blendv_ps(
        avx_cast<__m256>(concat(_mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)),
                                _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)))),
        avx_cast<__m256>(a),
        avx_cast<__m256>(cmpeq_epi32(b, setone_epi32()))));
}
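// 16-bit division goes through float: each 128-bit half is widened to eight floats,
// divided, and narrowed back to 16-bit integers.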
Vc_INTRINSIC __m256i div(__m256i a, __m256i b, short) {
    using namespace AVX;
    const __m256 lo =
        _mm256_div_ps(convert<short, float>(lo128(a)), convert<short, float>(lo128(b)));
    const __m256 hi =
        _mm256_div_ps(convert<short, float>(hi128(a)), convert<short, float>(hi128(b)));
    return concat(convert<float, short>(lo), convert<float, short>(hi));
}
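// Horizontal reductions over a 256-bit vector: combine the two 128-bit halves
// elementwise, then defer to the 128-bit reduction overloads of the same name.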
template <typename T> Vc_INTRINSIC T add(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
    return {add(add(AVX::lo128(a), AVX::hi128(a), T()), T())};
}
template <typename T> Vc_INTRINSIC T mul(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
    return {mul(mul(AVX::lo128(a), AVX::hi128(a), T()), T())};
}
template <typename T> Vc_INTRINSIC T min(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
    return {min(min(AVX::lo128(a), AVX::hi128(a), T()), T())};
}
template <typename T> Vc_INTRINSIC T max(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
    return {max(max(AVX::lo128(a), AVX::hi128(a), T()), T())};
}
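// Elementwise compares. Unsigned element types use the cmpgt_epu* helpers; the derived
// relations (!=, >=, <=) are obtained by negating the complementary compare with not_().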
  12055. Vc_INTRINSIC __m256 cmpeq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpeq_ps(a, b); }
  12056. Vc_INTRINSIC __m256d cmpeq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpeq_pd(a, b); }
  12057. Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, int) { return AvxIntrinsics::cmpeq_epi32(a, b); }
  12058. Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, uint) { return AvxIntrinsics::cmpeq_epi32(a, b); }
  12059. Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, short) { return AvxIntrinsics::cmpeq_epi16(a, b); }
  12060. Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, ushort) { return AvxIntrinsics::cmpeq_epi16(a, b); }
  12061. Vc_INTRINSIC __m256 cmpneq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpneq_ps(a, b); }
  12062. Vc_INTRINSIC __m256d cmpneq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpneq_pd(a, b); }
  12063. Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, int) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); }
  12064. Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uint) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); }
  12065. Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, short) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); }
  12066. Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, ushort) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); }
  12067. Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, schar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); }
  12068. Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uchar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); }
  12069. Vc_INTRINSIC __m256 cmpgt(__m256 a, __m256 b, float) { return AVX::cmpgt_ps(a, b); }
  12070. Vc_INTRINSIC __m256d cmpgt(__m256d a, __m256d b, double) { return AVX::cmpgt_pd(a, b); }
  12071. Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(a, b); }
  12072. Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(a, b); }
  12073. Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(a, b); }
  12074. Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(a, b); }
  12075. Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (a, b); }
  12076. Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (a, b); }
  12077. Vc_INTRINSIC __m256 cmpge(__m256 a, __m256 b, float) { return AVX::cmpge_ps(a, b); }
  12078. Vc_INTRINSIC __m256d cmpge(__m256d a, __m256d b, double) { return AVX::cmpge_pd(a, b); }
  12079. Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(b, a)); }
  12080. Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(b, a)); }
  12081. Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(b, a)); }
  12082. Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(b, a)); }
  12083. Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (b, a)); }
  12084. Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (b, a)); }
  12085. Vc_INTRINSIC __m256 cmple(__m256 a, __m256 b, float) { return AVX::cmple_ps(a, b); }
  12086. Vc_INTRINSIC __m256d cmple(__m256d a, __m256d b, double) { return AVX::cmple_pd(a, b); }
  12087. Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(a, b)); }
  12088. Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(a, b)); }
  12089. Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(a, b)); }
  12090. Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(a, b)); }
  12091. Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (a, b)); }
  12092. Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (a, b)); }
  12093. Vc_INTRINSIC __m256 cmplt(__m256 a, __m256 b, float) { return AVX::cmplt_ps(a, b); }
  12094. Vc_INTRINSIC __m256d cmplt(__m256d a, __m256d b, double) { return AVX::cmplt_pd(a, b); }
  12095. Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(b, a); }
  12096. Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(b, a); }
  12097. Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(b, a); }
  12098. Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(b, a); }
  12099. Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (b, a); }
  12100. Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (b, a); }
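// Fused multiply-add (a * b + c). With FMA4/FMA3 this maps to a single instruction;
// otherwise the float variant falls back to double precision, where the product of two
// floats is exact, so the fallback closely emulates the fused operation.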
Vc_INTRINSIC __m256 fma(__m256 a, __m256 b, __m256 c, float) {
#ifdef Vc_IMPL_FMA4
    return _mm256_macc_ps(a, b, c);
#elif defined Vc_IMPL_FMA
    return _mm256_fmadd_ps(a, b, c);
#else
    using namespace AVX;
    __m256d v1_0 = _mm256_cvtps_pd(lo128(a));
    __m256d v1_1 = _mm256_cvtps_pd(hi128(a));
    __m256d v2_0 = _mm256_cvtps_pd(lo128(b));
    __m256d v2_1 = _mm256_cvtps_pd(hi128(b));
    __m256d v3_0 = _mm256_cvtps_pd(lo128(c));
    __m256d v3_1 = _mm256_cvtps_pd(hi128(c));
    return concat(_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
                  _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
#endif
}
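// Double-precision fallback without hardware FMA: each operand is split via
// highMaskDouble into a coarse high part and a residual low part (what looks like a
// Dekker-style split) so the product can be accumulated as hh + lh + ll; the partial
// sums are then added in an order chosen by magnitude to limit the rounding error
// relative to a true fused multiply-add.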
Vc_INTRINSIC __m256d fma(__m256d a, __m256d b, __m256d c, double)
{
#ifdef Vc_IMPL_FMA4
    return _mm256_macc_pd(a, b, c);
#elif defined Vc_IMPL_FMA
    return _mm256_fmadd_pd(a, b, c);
#else
    using namespace AVX;
    __m256d h1 = and_(a, _mm256_broadcast_sd(reinterpret_cast<const double *>(
                             &c_general::highMaskDouble)));
    __m256d h2 = and_(b, _mm256_broadcast_sd(reinterpret_cast<const double *>(
                             &c_general::highMaskDouble)));
    const __m256d l1 = _mm256_sub_pd(a, h1);
    const __m256d l2 = _mm256_sub_pd(b, h2);
    const __m256d ll = mul(l1, l2, double());
    const __m256d lh = add(mul(l1, h2, double()), mul(h1, l2, double()), double());
    const __m256d hh = mul(h1, h2, double());
    const __m256d lh_lt_v3 = cmplt(abs(lh, double()), abs(c, double()), double());
    const __m256d x = _mm256_blendv_pd(c, lh, lh_lt_v3);
    const __m256d y = _mm256_blendv_pd(lh, c, lh_lt_v3);
    return add(add(ll, x, double()), add(y, hh, double()), double());
#endif
}
template <typename T> Vc_INTRINSIC __m256i fma(__m256i a, __m256i b, __m256i c, T)
{
    return add(mul(a, b, T()), c, T());
}
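// Shift helpers: the template parameter carries a compile-time shift count, the
// three-argument overloads take a runtime count. Signed element tags use arithmetic
// shifts (sra*), unsigned tags use logical shifts (srl*); left shifts are the same for
// both.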
  12145. template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, int) { return AVX::srai_epi32<shift>(a); }
  12146. template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, uint) { return AVX::srli_epi32<shift>(a); }
  12147. template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, short) { return AVX::srai_epi16<shift>(a); }
  12148. template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, ushort) { return AVX::srli_epi16<shift>(a); }
  12149. Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, int) { return AVX::sra_epi32(a, _mm_cvtsi32_si128(shift)); }
  12150. Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, uint) { return AVX::srl_epi32(a, _mm_cvtsi32_si128(shift)); }
  12151. Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, short) { return AVX::sra_epi16(a, _mm_cvtsi32_si128(shift)); }
  12152. Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, ushort) { return AVX::srl_epi16(a, _mm_cvtsi32_si128(shift)); }
  12153. template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, int) { return AVX::slli_epi32<shift>(a); }
  12154. template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, uint) { return AVX::slli_epi32<shift>(a); }
  12155. template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, short) { return AVX::slli_epi16<shift>(a); }
  12156. template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, ushort) { return AVX::slli_epi16<shift>(a); }
  12157. Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, int) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); }
  12158. Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, uint) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); }
  12159. Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, short) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); }
  12160. Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, ushort) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); }
  12161. Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m256 x) { return x; }
  12162. Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m256d x) { return x; }
  12163. Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m256i x) { return x; }
  12164. Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m128 x) { return AVX::zeroExtend(x); }
  12165. Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m128d x) { return AVX::zeroExtend(x); }
  12166. Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m128i x) { return AVX::zeroExtend(x); }
  12167. Vc_INTRINSIC __m256 avx_broadcast( float x) { return _mm256_set1_ps(x); }
  12168. Vc_INTRINSIC __m256d avx_broadcast(double x) { return _mm256_set1_pd(x); }
  12169. Vc_INTRINSIC __m256i avx_broadcast( int x) { return _mm256_set1_epi32(x); }
  12170. Vc_INTRINSIC __m256i avx_broadcast( uint x) { return _mm256_set1_epi32(x); }
  12171. Vc_INTRINSIC __m256i avx_broadcast( short x) { return _mm256_set1_epi16(x); }
  12172. Vc_INTRINSIC __m256i avx_broadcast(ushort x) { return _mm256_set1_epi16(x); }
  12173. Vc_INTRINSIC __m256i avx_broadcast( char x) { return _mm256_set1_epi8(x); }
  12174. Vc_INTRINSIC __m256i avx_broadcast( schar x) { return _mm256_set1_epi8(x); }
  12175. Vc_INTRINSIC __m256i avx_broadcast( uchar x) { return _mm256_set1_epi8(x); }
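// sorted() is specialised per implementation level (AVX vs. AVX2); the generic overload
// simply dispatches to the implementation selected at compile time.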
template <Vc::Implementation Impl, typename T,
          typename = enable_if<(Impl >= AVXImpl && Impl <= AVX2Impl)>>
Vc_CONST_L AVX2::Vector<T> sorted(AVX2::Vector<T> x) Vc_CONST_R;
template <typename T> Vc_INTRINSIC Vc_CONST AVX2::Vector<T> sorted(AVX2::Vector<T> x)
{
    return sorted<CurrentImplementation::current()>(x);
}
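// shifted(v, amount): shifts the vector's entries by a runtime amount with zero fill.
// The switch translates the runtime element count into the compile-time byte-shift
// helper (amount * sizeof(T) bytes); the nested ifs prune cases that cannot occur for
// larger element types, and any amount at or beyond the entry count yields a zero
// vector.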
  12183. template <typename T, typename V>
  12184. static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32), V> shifted(V v, int amount)
  12185. {
  12186. using namespace AVX;
  12187. constexpr int S = sizeof(T);
  12188. switch (amount) {
  12189. case 0: return v;
  12190. case 1: return shifted<sanitize<V>( 1 * S)>(v);
  12191. case 2: return shifted<sanitize<V>( 2 * S)>(v);
  12192. case 3: return shifted<sanitize<V>( 3 * S)>(v);
  12193. case -1: return shifted<sanitize<V>(-1 * S)>(v);
  12194. case -2: return shifted<sanitize<V>(-2 * S)>(v);
  12195. case -3: return shifted<sanitize<V>(-3 * S)>(v);
  12196. }
  12197. if (sizeof(T) <= 4) {
  12198. switch (amount) {
  12199. case 4: return shifted<sanitize<V>( 4 * S)>(v);
  12200. case 5: return shifted<sanitize<V>( 5 * S)>(v);
  12201. case 6: return shifted<sanitize<V>( 6 * S)>(v);
  12202. case 7: return shifted<sanitize<V>( 7 * S)>(v);
  12203. case -4: return shifted<sanitize<V>(-4 * S)>(v);
  12204. case -5: return shifted<sanitize<V>(-5 * S)>(v);
  12205. case -6: return shifted<sanitize<V>(-6 * S)>(v);
  12206. case -7: return shifted<sanitize<V>(-7 * S)>(v);
  12207. }
  12208. if (sizeof(T) <= 2) {
  12209. switch (amount) {
  12210. case 8: return shifted<sanitize<V>( 8 * S)>(v);
  12211. case 9: return shifted<sanitize<V>( 9 * S)>(v);
  12212. case 10: return shifted<sanitize<V>( 10 * S)>(v);
  12213. case 11: return shifted<sanitize<V>( 11 * S)>(v);
  12214. case 12: return shifted<sanitize<V>( 12 * S)>(v);
  12215. case 13: return shifted<sanitize<V>( 13 * S)>(v);
  12216. case 14: return shifted<sanitize<V>( 14 * S)>(v);
  12217. case 15: return shifted<sanitize<V>( 15 * S)>(v);
  12218. case -8: return shifted<sanitize<V>(- 8 * S)>(v);
  12219. case -9: return shifted<sanitize<V>(- 9 * S)>(v);
  12220. case -10: return shifted<sanitize<V>(-10 * S)>(v);
  12221. case -11: return shifted<sanitize<V>(-11 * S)>(v);
  12222. case -12: return shifted<sanitize<V>(-12 * S)>(v);
  12223. case -13: return shifted<sanitize<V>(-13 * S)>(v);
  12224. case -14: return shifted<sanitize<V>(-14 * S)>(v);
  12225. case -15: return shifted<sanitize<V>(-15 * S)>(v);
  12226. }
  12227. if (sizeof(T) == 1) {
  12228. switch (amount) {
  12229. case 16: return shifted<sanitize<V>( 16)>(v);
  12230. case 17: return shifted<sanitize<V>( 17)>(v);
  12231. case 18: return shifted<sanitize<V>( 18)>(v);
  12232. case 19: return shifted<sanitize<V>( 19)>(v);
  12233. case 20: return shifted<sanitize<V>( 20)>(v);
  12234. case 21: return shifted<sanitize<V>( 21)>(v);
  12235. case 22: return shifted<sanitize<V>( 22)>(v);
  12236. case 23: return shifted<sanitize<V>( 23)>(v);
  12237. case 24: return shifted<sanitize<V>( 24)>(v);
  12238. case 25: return shifted<sanitize<V>( 25)>(v);
  12239. case 26: return shifted<sanitize<V>( 26)>(v);
  12240. case 27: return shifted<sanitize<V>( 27)>(v);
  12241. case 28: return shifted<sanitize<V>( 28)>(v);
  12242. case 29: return shifted<sanitize<V>( 29)>(v);
  12243. case 30: return shifted<sanitize<V>( 30)>(v);
  12244. case 31: return shifted<sanitize<V>( 31)>(v);
  12245. case -16: return shifted<sanitize<V>(-16)>(v);
  12246. case -17: return shifted<sanitize<V>(-17)>(v);
  12247. case -18: return shifted<sanitize<V>(-18)>(v);
  12248. case -19: return shifted<sanitize<V>(-19)>(v);
  12249. case -20: return shifted<sanitize<V>(-20)>(v);
  12250. case -21: return shifted<sanitize<V>(-21)>(v);
  12251. case -22: return shifted<sanitize<V>(-22)>(v);
  12252. case -23: return shifted<sanitize<V>(-23)>(v);
  12253. case -24: return shifted<sanitize<V>(-24)>(v);
  12254. case -25: return shifted<sanitize<V>(-25)>(v);
  12255. case -26: return shifted<sanitize<V>(-26)>(v);
  12256. case -27: return shifted<sanitize<V>(-27)>(v);
  12257. case -28: return shifted<sanitize<V>(-28)>(v);
  12258. case -29: return shifted<sanitize<V>(-29)>(v);
  12259. case -30: return shifted<sanitize<V>(-30)>(v);
  12260. case -31: return shifted<sanitize<V>(-31)>(v);
  12261. }
  12262. }
  12263. }
  12264. }
  12265. return avx_cast<V>(_mm256_setzero_ps());
  12266. }
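// Same idea for 16-byte vectors, using the SSE byte shifts _mm_srli_si128 /
// _mm_slli_si128 directly.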
  12267. template <typename T, typename V>
  12268. static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> shifted(V v, int amount)
  12269. {
  12270. using namespace AVX;
  12271. switch (amount) {
  12272. case 0: return v;
  12273. case 1: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(1 * sizeof(T))));
  12274. case 2: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(2 * sizeof(T))));
  12275. case 3: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(3 * sizeof(T))));
  12276. case -1: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(1 * sizeof(T))));
  12277. case -2: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(2 * sizeof(T))));
  12278. case -3: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(3 * sizeof(T))));
  12279. }
  12280. if (sizeof(T) <= 2) {
  12281. switch (amount) {
  12282. case 4: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(4 * sizeof(T))));
  12283. case 5: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(5 * sizeof(T))));
  12284. case 6: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(6 * sizeof(T))));
  12285. case 7: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(7 * sizeof(T))));
  12286. case -4: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(4 * sizeof(T))));
  12287. case -5: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(5 * sizeof(T))));
  12288. case -6: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(6 * sizeof(T))));
  12289. case -7: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(7 * sizeof(T))));
  12290. }
  12291. }
  12292. return avx_cast<V>(_mm_setzero_ps());
  12293. }
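// rotated(v, amount): cyclic rotation of the entries (result[i] = v[(i + amount) % N]).
// AVX cannot rotate across the 128-bit lane boundary in one step, so the two halves are
// recombined with SSE alignr (palignr) and, for the half-way case, a plain lane swap.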
  12294. template <typename T, size_t N, typename V>
  12295. static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 4), V> rotated(V v,
  12296. int amount)
  12297. {
  12298. using namespace AVX;
  12299. const __m128i vLo = avx_cast<__m128i>(lo128(v));
  12300. const __m128i vHi = avx_cast<__m128i>(hi128(v));
  12301. switch (static_cast<unsigned int>(amount) % N) {
  12302. case 0:
  12303. return v;
  12304. case 1:
  12305. return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vHi, vLo),
  12306. SSE::alignr_epi8<sizeof(T)>(vLo, vHi)));
  12307. case 2:
  12308. return Mem::permute128<X1, X0>(v);
  12309. case 3:
  12310. return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vLo, vHi),
  12311. SSE::alignr_epi8<sizeof(T)>(vHi, vLo)));
  12312. }
  12313. return avx_cast<V>(_mm256_setzero_ps());
  12314. }
  12315. template <typename T, size_t N, typename V>
  12316. static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 8), V> rotated(V v,
  12317. int amount)
  12318. {
  12319. using namespace AVX;
  12320. const __m128i vLo = avx_cast<__m128i>(lo128(v));
  12321. const __m128i vHi = avx_cast<__m128i>(hi128(v));
  12322. switch (static_cast<unsigned int>(amount) % N) {
  12323. case 0:
  12324. return v;
  12325. case 1:
  12326. return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
  12327. SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
  12328. case 2:
  12329. return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
  12330. SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
  12331. case 3:
  12332. return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
  12333. SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
  12334. case 4:
  12335. return Mem::permute128<X1, X0>(v);
  12336. case 5:
  12337. return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
  12338. SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
  12339. case 6:
  12340. return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
  12341. SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
  12342. case 7:
  12343. return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
  12344. SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
  12345. }
  12346. return avx_cast<V>(_mm256_setzero_ps());
  12347. }
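// The 16-entry (16-bit element) rotation is AVX2-only: the quarter-vector offsets
// (4 and 12) are handled with a 64-bit lane permute (permute4x64), the remaining
// offsets again with palignr on the two halves.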
  12348. #ifdef Vc_IMPL_AVX2
  12349. template <typename T, size_t N, typename V>
  12350. static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 16), V> rotated(
  12351. V v, int amount)
  12352. {
  12353. using namespace AVX;
  12354. const __m128i vLo = avx_cast<__m128i>(lo128(v));
  12355. const __m128i vHi = avx_cast<__m128i>(hi128(v));
  12356. switch (static_cast<unsigned int>(amount) % N) {
  12357. case 0:
  12358. return v;
  12359. case 1:
  12360. return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
  12361. SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
  12362. case 2:
  12363. return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
  12364. SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
  12365. case 3:
  12366. return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
  12367. SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
  12368. case 4:
  12369. return Mem::permute4x64<X1, X2, X3, X0>(v);
  12370. case 5:
  12371. return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo),
  12372. SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi)));
  12373. case 6:
  12374. return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo),
  12375. SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi)));
  12376. case 7:
  12377. return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo),
  12378. SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi)));
  12379. case 8:
  12380. return Mem::permute128<X1, X0>(v);
  12381. case 9:
  12382. return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
  12383. SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
  12384. case 10:
  12385. return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
  12386. SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
  12387. case 11:
  12388. return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
  12389. SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
  12390. case 12:
  12391. return Mem::permute4x64<X3, X0, X1, X2>(v);
  12392. case 13:
  12393. return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi),
  12394. SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo)));
  12395. case 14:
  12396. return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi),
  12397. SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo)));
  12398. case 15:
  12399. return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi),
  12400. SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo)));
  12401. }
  12402. return avx_cast<V>(_mm256_setzero_ps());
  12403. }
  12404. #endif
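// PTEST-style helpers: testc(a, b) is non-zero iff every bit (or sign bit, for the
// float/double forms) set in b is also set in a; testz(a, b) iff a & b has none set;
// testnzc(a, b) iff both of the previous tests fail. movemask extracts the sign bits.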
  12405. Vc_INTRINSIC Vc_CONST int testc(__m128 a, __m128 b) { return _mm_testc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
  12406. Vc_INTRINSIC Vc_CONST int testc(__m256 a, __m256 b) { return _mm256_testc_ps(a, b); }
  12407. Vc_INTRINSIC Vc_CONST int testc(__m256d a, __m256d b) { return _mm256_testc_pd(a, b); }
  12408. Vc_INTRINSIC Vc_CONST int testc(__m256i a, __m256i b) { return _mm256_testc_si256(a, b); }
  12409. Vc_INTRINSIC Vc_CONST int testz(__m128 a, __m128 b) { return _mm_testz_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
  12410. Vc_INTRINSIC Vc_CONST int testz(__m256 a, __m256 b) { return _mm256_testz_ps(a, b); }
  12411. Vc_INTRINSIC Vc_CONST int testz(__m256d a, __m256d b) { return _mm256_testz_pd(a, b); }
  12412. Vc_INTRINSIC Vc_CONST int testz(__m256i a, __m256i b) { return _mm256_testz_si256(a, b); }
  12413. Vc_INTRINSIC Vc_CONST int testnzc(__m128 a, __m128 b) { return _mm_testnzc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
  12414. Vc_INTRINSIC Vc_CONST int testnzc(__m256 a, __m256 b) { return _mm256_testnzc_ps(a, b); }
  12415. Vc_INTRINSIC Vc_CONST int testnzc(__m256d a, __m256d b) { return _mm256_testnzc_pd(a, b); }
  12416. Vc_INTRINSIC Vc_CONST int testnzc(__m256i a, __m256i b) { return _mm256_testnzc_si256(a, b); }
  12417. Vc_INTRINSIC Vc_CONST int movemask(__m256i a) { return AVX::movemask_epi8(a); }
  12418. Vc_INTRINSIC Vc_CONST int movemask(__m128i a) { return _mm_movemask_epi8(a); }
  12419. Vc_INTRINSIC Vc_CONST int movemask(__m256d a) { return _mm256_movemask_pd(a); }
  12420. Vc_INTRINSIC Vc_CONST int movemask(__m128d a) { return _mm_movemask_pd(a); }
  12421. Vc_INTRINSIC Vc_CONST int movemask(__m256 a) { return _mm256_movemask_ps(a); }
  12422. Vc_INTRINSIC Vc_CONST int movemask(__m128 a) { return _mm_movemask_ps(a); }
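// mask_store: writes an N-entry SIMD mask out as N bool bytes (0 or 1). N == 4 packs
// one bit per 64-bit mask element into a single 32-bit store, N == 8 narrows the 32-bit
// mask elements to bytes, and N == 16 packs the 16-bit mask elements and stores a full
// 16-byte block.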
template <size_t N, typename Flags>
Vc_INTRINSIC void mask_store(__m256i k, bool *mem, Flags)
{
    static_assert(
        N == 4 || N == 8 || N == 16,
        "mask_store(__m256i, bool *) is only implemented for 4, 8, and 16 entries");
    switch (N) {
    case 4:
        *aliasing_cast<int32_t>(mem) = (_mm_movemask_epi8(AVX::lo128(k)) |
                                        (_mm_movemask_epi8(AVX::hi128(k)) << 16)) &
                                       0x01010101;
        break;
    case 8: {
        const auto k2 = _mm_srli_epi16(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)), 15);
        const auto k3 = _mm_packs_epi16(k2, _mm_setzero_si128());
#ifdef __x86_64__
        *aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k3);
#else
        *aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(k3);
        *aliasing_cast<int32_t>(mem + 4) = _mm_extract_epi32(k3, 1);
#endif
    } break;
    case 16: {
        const auto bools = Detail::and_(_mm_set1_epi8(1),
                                        _mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
        if (Flags::IsAligned) {
            _mm_store_si128(reinterpret_cast<__m128i *>(mem), bools);
        } else {
            _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), bools);
        }
    } break;
    default:
        Vc_UNREACHABLE();
    }
}
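// mask_load: the inverse of mask_store; expands N bool bytes from memory back into a
// full SIMD mask by widening the bytes and comparing against zero.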
  12458. template <typename R, size_t N, typename Flags>
  12459. Vc_INTRINSIC R mask_load(const bool *mem, Flags,
  12460. enable_if<std::is_same<R, __m128>::value> = nullarg)
  12461. {
  12462. static_assert(N == 4 || N == 8,
  12463. "mask_load<__m128>(const bool *) is only implemented for 4, 8 entries");
  12464. switch (N) {
  12465. case 4: {
  12466. __m128i k = _mm_cvtsi32_si128(*aliasing_cast<int32_t>(mem));
  12467. k = _mm_unpacklo_epi8(k, k);
  12468. k = _mm_unpacklo_epi16(k, k);
  12469. k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
  12470. return AVX::avx_cast<__m128>(k);
  12471. }
  12472. case 8: {
  12473. #ifdef __x86_64__
  12474. __m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
  12475. #else
  12476. __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
  12477. #endif
  12478. return AVX::avx_cast<__m128>(
  12479. _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
  12480. }
  12481. default:
  12482. Vc_UNREACHABLE();
  12483. }
  12484. }
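// 256-bit variant. For N == 4 the four bools are fetched with one 32-bit load, isolated
// per lane with the 0x1/0x100/0x10000/0x1000000 masks, and widened to 64-bit mask
// entries.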
  12485. template <typename R, size_t N, typename Flags>
  12486. Vc_INTRINSIC R mask_load(const bool *mem, Flags,
  12487. enable_if<std::is_same<R, __m256>::value> = nullarg)
  12488. {
  12489. static_assert(
  12490. N == 4 || N == 8 || N == 16,
  12491. "mask_load<__m256>(const bool *) is only implemented for 4, 8, and 16 entries");
  12492. switch (N) {
  12493. case 4: {
  12494. __m128i k = AVX::avx_cast<__m128i>(_mm_and_ps(
  12495. _mm_set1_ps(*aliasing_cast<float>(mem)),
  12496. AVX::avx_cast<__m128>(_mm_setr_epi32(0x1, 0x100, 0x10000, 0x1000000))));
  12497. k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
  12498. return AVX::avx_cast<__m256>(
  12499. AVX::concat(_mm_unpacklo_epi32(k, k), _mm_unpackhi_epi32(k, k)));
  12500. }
  12501. case 8: {
  12502. #ifdef __x86_64__
  12503. __m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
  12504. #else
  12505. __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
  12506. #endif
  12507. k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
  12508. return AVX::avx_cast<__m256>(
  12509. AVX::concat(_mm_unpacklo_epi16(k, k), _mm_unpackhi_epi16(k, k)));
  12510. }
  12511. case 16: {
  12512. const auto k128 = _mm_cmpgt_epi8(
  12513. Flags::IsAligned ? _mm_load_si128(reinterpret_cast<const __m128i *>(mem))
  12514. : _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem)),
  12515. _mm_setzero_si128());
  12516. return AVX::avx_cast<__m256>(
  12517. AVX::concat(_mm_unpacklo_epi8(k128, k128), _mm_unpackhi_epi8(k128, k128)));
  12518. }
  12519. default:
  12520. Vc_UNREACHABLE();
  12521. return R();
  12522. }
  12523. }
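// mask_to_int<N>: compresses an N-entry mask to one bit per entry. The 4- and 8-entry
// cases reuse the pd/ps movemask; the 16-entry case takes the byte mask and, with BMI2,
// drops every second (duplicated) bit via _pext_u32; the 32-entry case is the byte mask
// itself.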
  12524. template <size_t Size>
  12525. Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m256i x) Vc_INTRINSIC_R Vc_CONST_R;
  12526. template <> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m256i k)
  12527. {
  12528. return movemask(AVX::avx_cast<__m256d>(k));
  12529. }
  12530. template <> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m256i k)
  12531. {
  12532. return movemask(AVX::avx_cast<__m256>(k));
  12533. }
  12534. #ifdef Vc_IMPL_BMI2
  12535. template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k)
  12536. {
  12537. return _pext_u32(movemask(k), 0x55555555u);
  12538. }
  12539. #endif
  12540. template <> Vc_INTRINSIC Vc_CONST int mask_to_int<32>(__m256i k)
  12541. {
  12542. return movemask(k);
  12543. }
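// InterleaveImpl<V, 16, 32>: (de)interleaving support for vectors with 16 16-bit
// entries. interleave(data, i, v0, v1, ...) stores v0[k], v1[k], ... consecutively at
// data[i[k]] (AoS layout); deinterleave() gathers such groups back into separate
// vectors. The Common::SuccessiveEntries overloads cover the contiguous-index case with
// whole-vector stores instead of per-entry extracts.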
  12544. template<typename V> struct InterleaveImpl<V, 16, 32> {
  12545. template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
  12546. const typename V::AsArg v0,
  12547. const typename V::AsArg v1)
  12548. {
  12549. const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data());
  12550. const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data());
  12551. using namespace AVX;
  12552. *aliasing_cast<uint32_t>(&data[i[ 0]]) = _mm_cvtsi128_si32(lo128(tmp0));
  12553. *aliasing_cast<uint32_t>(&data[i[ 1]]) = _mm_extract_epi32(lo128(tmp0), 1);
  12554. *aliasing_cast<uint32_t>(&data[i[ 2]]) = _mm_extract_epi32(lo128(tmp0), 2);
  12555. *aliasing_cast<uint32_t>(&data[i[ 3]]) = _mm_extract_epi32(lo128(tmp0), 3);
  12556. *aliasing_cast<uint32_t>(&data[i[ 4]]) = _mm_cvtsi128_si32(lo128(tmp1));
  12557. *aliasing_cast<uint32_t>(&data[i[ 5]]) = _mm_extract_epi32(lo128(tmp1), 1);
  12558. *aliasing_cast<uint32_t>(&data[i[ 6]]) = _mm_extract_epi32(lo128(tmp1), 2);
  12559. *aliasing_cast<uint32_t>(&data[i[ 7]]) = _mm_extract_epi32(lo128(tmp1), 3);
  12560. *aliasing_cast<uint32_t>(&data[i[ 8]]) = _mm_cvtsi128_si32(hi128(tmp0));
  12561. *aliasing_cast<uint32_t>(&data[i[ 9]]) = _mm_extract_epi32(hi128(tmp0), 1);
  12562. *aliasing_cast<uint32_t>(&data[i[10]]) = _mm_extract_epi32(hi128(tmp0), 2);
  12563. *aliasing_cast<uint32_t>(&data[i[11]]) = _mm_extract_epi32(hi128(tmp0), 3);
  12564. *aliasing_cast<uint32_t>(&data[i[12]]) = _mm_cvtsi128_si32(hi128(tmp1));
  12565. *aliasing_cast<uint32_t>(&data[i[13]]) = _mm_extract_epi32(hi128(tmp1), 1);
  12566. *aliasing_cast<uint32_t>(&data[i[14]]) = _mm_extract_epi32(hi128(tmp1), 2);
  12567. *aliasing_cast<uint32_t>(&data[i[15]]) = _mm_extract_epi32(hi128(tmp1), 3);
  12568. }
  12569. static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
  12570. const typename V::AsArg v0, const typename V::AsArg v1)
  12571. {
  12572. const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data());
  12573. const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data());
  12574. V(Mem::shuffle128<X0, Y0>(tmp0, tmp1)).store(&data[i[0]], Vc::Unaligned);
  12575. V(Mem::shuffle128<X1, Y1>(tmp0, tmp1)).store(&data[i[8]], Vc::Unaligned);
  12576. }
  12577. template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
  12578. const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
  12579. {
  12580. interleave(data, i, v0, v1);
  12581. v2.scatter(data + 2, i);
  12582. }
  12583. template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
  12584. const typename V::AsArg v0, const typename V::AsArg v1,
  12585. const typename V::AsArg v2, const typename V::AsArg v3)
  12586. {
  12587. const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data());
  12588. const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data());
  12589. const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data());
  12590. const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data());
  12591. const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
  12592. const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
  12593. const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
  12594. const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
  12595. using namespace AVX;
  12596. auto &&store = [&](__m256i x, int offset) {
  12597. _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 0]]), lo128(x));
  12598. _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 8]]), hi128(x));
  12599. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 1]]), avx_cast<__m128>(x));
  12600. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 9]]), avx_cast<__m128>(hi128(x)));
  12601. };
  12602. store(tmp4, 0);
  12603. store(tmp5, 2);
  12604. store(tmp6, 4);
  12605. store(tmp7, 6);
  12606. }
  12607. static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,
  12608. const typename V::AsArg v0, const typename V::AsArg v1,
  12609. const typename V::AsArg v2, const typename V::AsArg v3)
  12610. {
  12611. const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data());
  12612. const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data());
  12613. const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data());
  12614. const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data());
  12615. const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
  12616. const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
  12617. const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
  12618. const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
  12619. V(Mem::shuffle128<X0, Y0>(tmp4, tmp5)).store(&data[i[0]], ::Vc::Unaligned);
  12620. V(Mem::shuffle128<X0, Y0>(tmp6, tmp7)).store(&data[i[4]], ::Vc::Unaligned);
  12621. V(Mem::shuffle128<X1, Y1>(tmp4, tmp5)).store(&data[i[8]], ::Vc::Unaligned);
  12622. V(Mem::shuffle128<X1, Y1>(tmp6, tmp7)).store(&data[i[12]], ::Vc::Unaligned);
  12623. }
  12624. template <typename I>
  12625. static inline void interleave(typename V::EntryType *const data, const I &i,
  12626. const typename V::AsArg v0, const typename V::AsArg v1,
  12627. const typename V::AsArg v2, const typename V::AsArg v3,
  12628. const typename V::AsArg v4)
  12629. {
  12630. interleave(data, i, v0, v1, v2, v3);
  12631. v4.scatter(data + 4, i);
  12632. }
  12633. template <typename I>
  12634. static inline void interleave(typename V::EntryType *const data, const I &i,
  12635. const typename V::AsArg v0, const typename V::AsArg v1,
  12636. const typename V::AsArg v2, const typename V::AsArg v3,
  12637. const typename V::AsArg v4, const typename V::AsArg v5)
  12638. {
  12639. interleave(data, i, v0, v1, v2, v3);
  12640. interleave(data + 4, i, v4, v5);
  12641. }
  12642. template <typename I>
  12643. static inline void interleave(typename V::EntryType *const data, const I &i,
  12644. const typename V::AsArg v0, const typename V::AsArg v1,
  12645. const typename V::AsArg v2, const typename V::AsArg v3,
  12646. const typename V::AsArg v4, const typename V::AsArg v5,
  12647. const typename V::AsArg v6)
  12648. {
  12649. interleave(data, i, v0, v1, v2, v3);
  12650. interleave(data + 4, i, v4, v5, v6);
  12651. }
  12652. template <typename I>
  12653. static inline void interleave(typename V::EntryType *const data, const I &i,
  12654. const typename V::AsArg v0, const typename V::AsArg v1,
  12655. const typename V::AsArg v2, const typename V::AsArg v3,
  12656. const typename V::AsArg v4, const typename V::AsArg v5,
  12657. const typename V::AsArg v6, const typename V::AsArg v7)
  12658. {
  12659. interleave(data, i, v0, v1, v2, v3);
  12660. interleave(data + 4, i, v4, v5, v6, v7);
  12661. }
  12662. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  12663. const I &i, V &v0, V &v1)
  12664. {
  12665. const __m256i tmp4 =
  12666. _mm256_setr_epi32(
  12667. *aliasing_cast<int>(&data[i[0]]), *aliasing_cast<int>(&data[i[1]]),
  12668. *aliasing_cast<int>(&data[i[2]]), *aliasing_cast<int>(&data[i[3]]),
  12669. *aliasing_cast<int>(&data[i[8]]), *aliasing_cast<int>(&data[i[9]]),
  12670. *aliasing_cast<int>(&data[i[10]]), *aliasing_cast<int>(&data[i[11]]));
  12671. const __m256i tmp5 =
  12672. _mm256_setr_epi32(
  12673. *aliasing_cast<int>(&data[i[4]]), *aliasing_cast<int>(&data[i[5]]),
  12674. *aliasing_cast<int>(&data[i[6]]), *aliasing_cast<int>(&data[i[7]]),
  12675. *aliasing_cast<int>(&data[i[12]]), *aliasing_cast<int>(&data[i[13]]),
  12676. *aliasing_cast<int>(&data[i[14]]), *aliasing_cast<int>(&data[i[15]]));
  12677. const __m256i tmp2 = AVX::unpacklo_epi16(tmp4, tmp5);
  12678. const __m256i tmp3 = AVX::unpackhi_epi16(tmp4, tmp5);
  12679. const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
  12680. const __m256i tmp1 = AVX::unpackhi_epi16(tmp2, tmp3);
  12681. v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
  12682. v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
  12683. }
  12684. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  12685. const I &i, V &v0, V &v1, V &v2)
  12686. {
  12687. using namespace AVX;
  12688. const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(
  12689. *aliasing_cast<double>(&data[i[0]]), *aliasing_cast<double>(&data[i[1]]),
  12690. *aliasing_cast<double>(&data[i[8]]), *aliasing_cast<double>(&data[i[9]])));
  12691. const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(
  12692. *aliasing_cast<double>(&data[i[2]]), *aliasing_cast<double>(&data[i[3]]),
  12693. *aliasing_cast<double>(&data[i[10]]), *aliasing_cast<double>(&data[i[11]])));
  12694. const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(
  12695. *aliasing_cast<double>(&data[i[4]]), *aliasing_cast<double>(&data[i[5]]),
  12696. *aliasing_cast<double>(&data[i[12]]), *aliasing_cast<double>(&data[i[13]])));
  12697. const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(
  12698. *aliasing_cast<double>(&data[i[6]]), *aliasing_cast<double>(&data[i[7]]),
  12699. *aliasing_cast<double>(&data[i[14]]), *aliasing_cast<double>(&data[i[15]])));
  12700. const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
  12701. const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
  12702. const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
  12703. const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
  12704. const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6);
  12705. const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6);
  12706. const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7);
  12707. const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7);
  12708. v0.data() = AVX::unpacklo_epi16(tmp8, tmp10);
  12709. v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
  12710. v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
  12711. }
  12712. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  12713. const I &i, V &v0, V &v1, V &v2, V &v3)
  12714. {
  12715. using namespace AVX;
  12716. const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(
  12717. *aliasing_cast<double>(&data[i[0]]), *aliasing_cast<double>(&data[i[1]]),
  12718. *aliasing_cast<double>(&data[i[8]]), *aliasing_cast<double>(&data[i[9]])));
  12719. const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(
  12720. *aliasing_cast<double>(&data[i[2]]), *aliasing_cast<double>(&data[i[3]]),
  12721. *aliasing_cast<double>(&data[i[10]]), *aliasing_cast<double>(&data[i[11]])));
  12722. const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(
  12723. *aliasing_cast<double>(&data[i[4]]), *aliasing_cast<double>(&data[i[5]]),
  12724. *aliasing_cast<double>(&data[i[12]]), *aliasing_cast<double>(&data[i[13]])));
  12725. const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(
  12726. *aliasing_cast<double>(&data[i[6]]), *aliasing_cast<double>(&data[i[7]]),
  12727. *aliasing_cast<double>(&data[i[14]]), *aliasing_cast<double>(&data[i[15]])));
  12728. const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
  12729. const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
  12730. const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
  12731. const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
  12732. const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6);
  12733. const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6);
  12734. const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7);
  12735. const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7);
  12736. v0.data() = AVX::unpacklo_epi16(tmp8, tmp10);
  12737. v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
  12738. v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
  12739. v3.data() = AVX::unpackhi_epi16(tmp9, tmp11);
  12740. }
  12741. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  12742. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
  12743. {
  12744. using namespace AVX;
  12745. const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
  12746. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
  12747. const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
  12748. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
  12749. const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
  12750. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
  12751. const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
  12752. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
  12753. const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
  12754. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
  12755. const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
  12756. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
  12757. const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
  12758. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
  12759. const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
  12760. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
  12761. const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
  12762. const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
  12763. const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
  12764. const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
  12765. const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
  12766. const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
  12767. const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
  12768. const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
  12769. const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
  12770. const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
  12771. const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
  12772. const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
  12773. const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
  12774. const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
  12775. v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
  12776. v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
  12777. v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
  12778. v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
  12779. v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
  12780. }
  12781. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  12782. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
  12783. {
  12784. using namespace AVX;
  12785. const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
  12786. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
  12787. const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
  12788. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
  12789. const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
  12790. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
  12791. const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
  12792. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
  12793. const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
  12794. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
  12795. const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
  12796. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
  12797. const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
  12798. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
  12799. const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
  12800. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
  12801. const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
  12802. const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
  12803. const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
  12804. const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
  12805. const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
  12806. const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
  12807. const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
  12808. const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
  12809. const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
  12810. const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
  12811. const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
  12812. const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
  12813. const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
  12814. const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
  12815. v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
  12816. v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
  12817. v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
  12818. v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
  12819. v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
  12820. v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
  12821. }
  12822. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  12823. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
  12824. {
  12825. using namespace AVX;
  12826. const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
  12827. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
  12828. const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
  12829. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
  12830. const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
  12831. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
  12832. const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
  12833. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
  12834. const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
  12835. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
  12836. const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
  12837. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
  12838. const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
  12839. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
  12840. const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
  12841. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
  12842. const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
  12843. const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
  12844. const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
  12845. const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
  12846. const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
  12847. const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
  12848. const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
  12849. const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
  12850. const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
  12851. const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
  12852. const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
  12853. const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
  12854. const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
  12855. const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
  12856. const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11);
  12857. const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13);
  12858. v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
  12859. v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
  12860. v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
  12861. v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
  12862. v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
  12863. v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
  12864. v6.data() = AVX::unpacklo_epi16(tmp14, tmp15);
  12865. }
  12866. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  12867. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
  12868. {
  12869. using namespace AVX;
  12870. const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
  12871. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
  12872. const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
  12873. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
  12874. const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
  12875. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
  12876. const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
  12877. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
  12878. const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
  12879. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
  12880. const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
  12881. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
  12882. const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
  12883. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
  12884. const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
  12885. _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
  12886. const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
  12887. const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
  12888. const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
  12889. const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
  12890. const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
  12891. const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
  12892. const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
  12893. const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
  12894. const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
  12895. const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
  12896. const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
  12897. const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
  12898. const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
  12899. const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
  12900. const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11);
  12901. const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13);
  12902. v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
  12903. v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
  12904. v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
  12905. v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
  12906. v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
  12907. v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
  12908. v6.data() = AVX::unpacklo_epi16(tmp14, tmp15);
  12909. v7.data() = AVX::unpackhi_epi16(tmp14, tmp15);
  12910. }
  12911. };
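// Same interface for vectors with 8 32-bit entries (float, int, uint). The 3-vector
// overload can use masked 3-element stores per index when Vc_USE_MASKMOV_SCATTER is
// enabled, otherwise it falls back to a 2-vector interleave plus a scatter.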
  12912. template<typename V> struct InterleaveImpl<V, 8, 32> {
  12913. static_assert(sizeof(typename V::value_type) == 4, "");
  12914. template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
  12915. const typename V::AsArg v0, const typename V::AsArg v1)
  12916. {
  12917. using namespace AVX;
  12918. const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
  12919. const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
  12920. _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0));
  12921. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0));
  12922. _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1));
  12923. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1));
  12924. _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0));
  12925. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0));
  12926. _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1));
  12927. _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1));
  12928. }
  12929. static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
  12930. const typename V::AsArg v0, const typename V::AsArg v1)
  12931. {
  12932. using namespace AVX;
  12933. const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
  12934. const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
  12935. _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), lo128(tmp0));
  12936. _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), lo128(tmp1));
  12937. _mm_storeu_ps(aliasing_cast<float>(&data[i[4]]), hi128(tmp0));
  12938. _mm_storeu_ps(aliasing_cast<float>(&data[i[6]]), hi128(tmp1));
  12939. }
  12940. template <typename I>
  12941. static inline void interleave(typename V::EntryType *const data, const I &i,
  12942. const typename V::AsArg v0, const typename V::AsArg v1,
  12943. const typename V::AsArg v2)
  12944. {
  12945. using namespace AVX;
  12946. #ifdef Vc_USE_MASKMOV_SCATTER
  12947. const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
  12948. const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
  12949. const m256 tmp2 = _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
  12950. const m256 tmp3 = _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
  12951. const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2);
  12952. const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2);
  12953. const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3);
  12954. const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3);
  12955. const m128i mask = _mm_set_epi32(0, -1, -1, -1);
  12956. _mm_maskstore_ps(aliasing_cast<float>(&data[i[0]]), mask, lo128(tmp4));
  12957. _mm_maskstore_ps(aliasing_cast<float>(&data[i[1]]), mask, lo128(tmp5));
  12958. _mm_maskstore_ps(aliasing_cast<float>(&data[i[2]]), mask, lo128(tmp6));
  12959. _mm_maskstore_ps(aliasing_cast<float>(&data[i[3]]), mask, lo128(tmp7));
  12960. _mm_maskstore_ps(aliasing_cast<float>(&data[i[4]]), mask, hi128(tmp4));
  12961. _mm_maskstore_ps(aliasing_cast<float>(&data[i[5]]), mask, hi128(tmp5));
  12962. _mm_maskstore_ps(aliasing_cast<float>(&data[i[6]]), mask, hi128(tmp6));
  12963. _mm_maskstore_ps(aliasing_cast<float>(&data[i[7]]), mask, hi128(tmp7));
  12964. #else
  12965. interleave(data, i, v0, v1);
  12966. v2.scatter(data + 2, i);
  12967. #endif
  12968. }
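// Contiguous x,y,z interleave: in-lane shuffles and blends rearrange the three inputs
// so that three unaligned 256-bit stores emit the packed sequence
// x0 y0 z0 x1 y1 z1 ... without any scalar stores.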
  12969. static inline void interleave(typename V::EntryType *const data,
  12970. const Common::SuccessiveEntries<3> &i,
  12971. const typename V::AsArg v0_,
  12972. const typename V::AsArg v1_,
  12973. const typename V::AsArg v2_)
  12974. {
  12975. __m256 v0 = AVX::avx_cast<__m256>(v0_.data());
  12976. __m256 v1 = AVX::avx_cast<__m256>(v1_.data());
  12977. __m256 v2 = AVX::avx_cast<__m256>(v2_.data());
  12978. v0 = _mm256_shuffle_ps(v0, v0, 0x6c);
  12979. v1 = _mm256_shuffle_ps(v1, v1, 0xb1);
  12980. v2 = _mm256_shuffle_ps(v2, v2, 0xc6);
  12981. __m256 w0 = Mem::blend<X0, X1, Y2, X3, Y4, X5, X6, Y7>(
  12982. Mem::blend<X0, Y1, X2, X3, X4, X5, Y6, X7>(v0, v1), v2);
  12983. __m256 w1 = Mem::blend<X0, Y1, X2, X3, X4, Y5, X6, X7>(
  12984. Mem::blend<Y0, X1, X2, Y3, Y4, X5, X6, Y7>(v0, v1), v2);
  12985. __m256 w2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(
  12986. Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(v0, v1), v2);
  12987. _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]]),
  12988. _mm256_permute2f128_ps(w0, w1, 0x20));
  12989. _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 8), w2);
  12990. _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 16),
  12991. _mm256_permute2f128_ps(w1, w0, 0x31));
  12992. }
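// The SuccessiveEntries<3> overload above produces a contiguous AoS block
// x0 y0 z0 x1 y1 z1 ...: the three blends assemble 24 interleaved floats that
// are written with three unaligned 256-bit stores at offsets 0, 8 and 16 from
// data[i[0]].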
  12993. template <typename I>
  12994. static inline void interleave(typename V::EntryType *const data, const I &i,
  12995. const typename V::AsArg v0, const typename V::AsArg v1,
  12996. const typename V::AsArg v2, const typename V::AsArg v3)
  12997. {
  12998. using namespace AVX;
  12999. const __m256 tmp0 =
  13000. _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
  13001. const __m256 tmp1 =
  13002. _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
  13003. const __m256 tmp2 =
  13004. _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
  13005. const __m256 tmp3 =
  13006. _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
  13007. const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2);
  13008. const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2);
  13009. const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3);
  13010. const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3);
  13011. _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), lo128(_04));
  13012. _mm_storeu_ps(aliasing_cast<float>(&data[i[1]]), lo128(_15));
  13013. _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), lo128(_26));
  13014. _mm_storeu_ps(aliasing_cast<float>(&data[i[3]]), lo128(_37));
  13015. _mm_storeu_ps(aliasing_cast<float>(&data[i[4]]), hi128(_04));
  13016. _mm_storeu_ps(aliasing_cast<float>(&data[i[5]]), hi128(_15));
  13017. _mm_storeu_ps(aliasing_cast<float>(&data[i[6]]), hi128(_26));
  13018. _mm_storeu_ps(aliasing_cast<float>(&data[i[7]]), hi128(_37));
  13019. }
  13020. static inline void interleave(typename V::EntryType *const data,
  13021. const Common::SuccessiveEntries<4> &i,
  13022. const typename V::AsArg v0, const typename V::AsArg v1,
  13023. const typename V::AsArg v2, const typename V::AsArg v3)
  13024. {
  13025. using namespace AVX;
  13026. const __m256 tmp0 =
  13027. _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
  13028. const __m256 tmp1 =
  13029. _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
  13030. const __m256 tmp2 =
  13031. _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
  13032. const __m256 tmp3 =
  13033. _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
  13034. const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2);
  13035. const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2);
  13036. const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3);
  13037. const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3);
  13038. _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]]),
  13039. _mm256_permute2f128_ps(_04, _15, 0x20));
  13040. _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 8),
  13041. _mm256_permute2f128_ps(_26, _37, 0x20));
  13042. _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 16),
  13043. _mm256_permute2f128_ps(_04, _15, 0x31));
  13044. _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 24),
  13045. _mm256_permute2f128_ps(_26, _37, 0x31));
  13046. }
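// For four members and successive indexes the result is a contiguous block of
// 32 floats (x0 y0 z0 w0 x1 ...), emitted as four unaligned 256-bit stores
// instead of the eight 128-bit scatter stores used by the generic overload
// above.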
  13047. template <typename I>
  13048. static inline void interleave(typename V::EntryType *const data, const I &i,
  13049. const typename V::AsArg v0, const typename V::AsArg v1,
  13050. const typename V::AsArg v2, const typename V::AsArg v3,
  13051. const typename V::AsArg v4)
  13052. {
  13053. interleave(data, i, v0, v1, v2, v3);
  13054. v4.scatter(data + 4, i);
  13055. }
  13056. template <typename I>
  13057. static inline void interleave(typename V::EntryType *const data, const I &i,
  13058. const typename V::AsArg v0, const typename V::AsArg v1,
  13059. const typename V::AsArg v2, const typename V::AsArg v3,
  13060. const typename V::AsArg v4, const typename V::AsArg v5)
  13061. {
  13062. interleave(data, i, v0, v1, v2, v3);
  13063. interleave(data + 4, i, v4, v5);
  13064. }
  13065. template <typename I>
  13066. static inline void interleave(typename V::EntryType *const data, const I &i,
  13067. const typename V::AsArg v0, const typename V::AsArg v1,
  13068. const typename V::AsArg v2, const typename V::AsArg v3,
  13069. const typename V::AsArg v4, const typename V::AsArg v5,
  13070. const typename V::AsArg v6)
  13071. {
  13072. interleave(data, i, v0, v1, v2, v3);
  13073. interleave(data + 4, i, v4, v5, v6);
  13074. }
  13075. template <typename I>
  13076. static inline void interleave(typename V::EntryType *const data, const I &i,
  13077. const typename V::AsArg v0, const typename V::AsArg v1,
  13078. const typename V::AsArg v2, const typename V::AsArg v3,
  13079. const typename V::AsArg v4, const typename V::AsArg v5,
  13080. const typename V::AsArg v6, const typename V::AsArg v7)
  13081. {
  13082. interleave(data, i, v0, v1, v2, v3);
  13083. interleave(data + 4, i, v4, v5, v6, v7);
  13084. }
  13085. template <typename I>
  13086. static inline void deinterleave(typename V::EntryType const *const data, const I &i,
  13087. V &v0, V &v1)
  13088. {
  13089. using namespace AVX;
  13090. const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[0]]));
  13091. const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[2]]));
  13092. const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[4]]));
  13093. const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[6]]));
  13094. const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&data[i[1]]));
  13095. const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&data[i[3]]));
  13096. const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&data[i[5]]));
  13097. const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&data[i[7]]));
  13098. const m256 tmp2 = concat(il01, il45);
  13099. const m256 tmp3 = concat(il23, il67);
  13100. const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
  13101. const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3);
  13102. v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
  13103. v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
  13104. }
  13105. static inline void deinterleave(typename V::EntryType const *const data,
  13106. const Common::SuccessiveEntries<2> &i, V &v0, V &v1)
  13107. {
  13108. using namespace AVX;
  13109. const m256 il0123 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]]));
  13110. const m256 il4567 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[4]]));
  13111. const m256 tmp2 = Mem::shuffle128<X0, Y0>(il0123, il4567);
  13112. const m256 tmp3 = Mem::shuffle128<X1, Y1>(il0123, il4567);
  13113. const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
  13114. const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3);
  13115. v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
  13116. v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
  13117. }
  13118. template <typename I>
  13119. static inline void deinterleave(typename V::EntryType const *const data, const I &i,
  13120. V &v0, V &v1, V &v2)
  13121. {
  13122. using namespace AVX;
  13123. const m128 il0 = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
  13124. const m128 il1 = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
  13125. const m128 il2 = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
  13126. const m128 il3 = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
  13127. const m128 il4 = _mm_loadu_ps(aliasing_cast<float>(&data[i[4]]));
  13128. const m128 il5 = _mm_loadu_ps(aliasing_cast<float>(&data[i[5]]));
  13129. const m128 il6 = _mm_loadu_ps(aliasing_cast<float>(&data[i[6]]));
  13130. const m128 il7 = _mm_loadu_ps(aliasing_cast<float>(&data[i[7]]));
  13131. const m256 il04 = concat(il0, il4);
  13132. const m256 il15 = concat(il1, il5);
  13133. const m256 il26 = concat(il2, il6);
  13134. const m256 il37 = concat(il3, il7);
  13135. const m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
  13136. const m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
  13137. const m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
  13138. const m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
  13139. v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
  13140. v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
  13141. v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
  13142. }
  13143. static inline void deinterleave(typename V::EntryType const *const data,
  13144. const Common::SuccessiveEntries<3> &i, V &v0, V &v1,
  13145. V &v2)
  13146. {
  13147. __m256 in0 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 0));
  13148. __m256 in1 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 8));
  13149. __m256 in2 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 16));
  13150. const __m256 aaabffgg = _mm256_permute2f128_ps(in0, in2, 0x20);
  13151. const __m256 cdddeeef = in1;
  13152. const __m256 bbccghhh = _mm256_permute2f128_ps(in0, in2, 0x31);
  13153. const __m256 x0 = _mm256_blend_ps(
  13154. _mm256_blend_ps(aaabffgg, cdddeeef, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80),
  13155. bbccghhh, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0);
  13156. const __m256 x1 = _mm256_blend_ps(
  13157. _mm256_blend_ps(aaabffgg, cdddeeef, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0),
  13158. bbccghhh, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0);
  13159. const __m256 x2 = _mm256_blend_ps(
  13160. _mm256_blend_ps(aaabffgg, cdddeeef, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0),
  13161. bbccghhh, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80);
  13162. v0 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x0, x0, 0x6c));
  13163. v1 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x1, x1, 0xb1));
  13164. v2 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x2, x2, 0xc6));
  13165. }
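// Inverse of the packed xyz store: three 256-bit loads are re-blended into
// per-member registers, and the final _mm256_shuffle_ps calls with 0x6c, 0xb1
// and 0xc6 undo the element rotations applied on the interleave side (each of
// these shuffles is its own inverse).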
  13166. template <typename I>
  13167. static inline void deinterleave(typename V::EntryType const *const data, const I &i,
  13168. V &v0, V &v1, V &v2, V &v3)
  13169. {
  13170. using namespace AVX;
  13171. const m128 il0 = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
  13172. const m128 il1 = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
  13173. const m128 il2 = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
  13174. const m128 il3 = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
  13175. const m128 il4 = _mm_loadu_ps(aliasing_cast<float>(&data[i[4]]));
  13176. const m128 il5 = _mm_loadu_ps(aliasing_cast<float>(&data[i[5]]));
  13177. const m128 il6 = _mm_loadu_ps(aliasing_cast<float>(&data[i[6]]));
  13178. const m128 il7 = _mm_loadu_ps(aliasing_cast<float>(&data[i[7]]));
  13179. const m256 il04 = concat(il0, il4);
  13180. const m256 il15 = concat(il1, il5);
  13181. const m256 il26 = concat(il2, il6);
  13182. const m256 il37 = concat(il3, il7);
  13183. const m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
  13184. const m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
  13185. const m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
  13186. const m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
  13187. v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
  13188. v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
  13189. v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
  13190. v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
  13191. }
  13192. static inline void deinterleave(typename V::EntryType const *const data,
  13193. const Common::SuccessiveEntries<4> &i, V &v0, V &v1,
  13194. V &v2, V &v3)
  13195. {
  13196. using namespace AVX;
  13197. const __m256 il01 = _mm256_loadu_ps(
  13198. aliasing_cast<float>(&data[i[0]]));
  13199. const __m256 il23 = _mm256_loadu_ps(
  13200. aliasing_cast<float>(&data[i[2]]));
  13201. const __m256 il45 = _mm256_loadu_ps(
  13202. aliasing_cast<float>(&data[i[4]]));
  13203. const __m256 il67 = _mm256_loadu_ps(
  13204. aliasing_cast<float>(&data[i[6]]));
  13205. const __m256 il04 = _mm256_permute2f128_ps(il01, il45, 0x20);
  13206. const __m256 il15 = _mm256_permute2f128_ps(il01, il45, 0x31);
  13207. const __m256 il26 = _mm256_permute2f128_ps(il23, il67, 0x20);
  13208. const __m256 il37 = _mm256_permute2f128_ps(il23, il67, 0x31);
  13209. const __m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
  13210. const __m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
  13211. const __m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
  13212. const __m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
  13213. v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
  13214. v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
  13215. v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
  13216. v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
  13217. }
  13218. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  13219. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
  13220. {
  13221. v4.gather(data + 4, i);
  13222. deinterleave(data, i, v0, v1, v2, v3);
  13223. }
  13224. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  13225. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
  13226. {
  13227. deinterleave(data, i, v0, v1, v2, v3);
  13228. deinterleave(data + 4, i, v4, v5);
  13229. }
  13230. static inline void deinterleave(typename V::EntryType const *const data,
  13231. const Common::SuccessiveEntries<6> &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
  13232. {
  13233. using namespace AVX;
  13234. const m256 a = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]]));
  13235. const m256 b = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 1 * V::Size]));
  13236. const m256 c = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 2 * V::Size]));
  13237. const m256 d = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 3 * V::Size]));
  13238. const m256 e = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 4 * V::Size]));
  13239. const m256 f = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 5 * V::Size]));
  13240. const __m256 tmp2 = Mem::shuffle128<X0, Y0>(a, d);
  13241. const __m256 tmp3 = Mem::shuffle128<X1, Y1>(b, e);
  13242. const __m256 tmp4 = Mem::shuffle128<X1, Y1>(a, d);
  13243. const __m256 tmp5 = Mem::shuffle128<X0, Y0>(c, f);
  13244. const __m256 tmp8 = Mem::shuffle128<X0, Y0>(b, e);
  13245. const __m256 tmp9 = Mem::shuffle128<X1, Y1>(c, f);
  13246. const __m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
  13247. const __m256 tmp1 = _mm256_unpackhi_ps(tmp4, tmp5);
  13248. const __m256 tmp6 = _mm256_unpackhi_ps(tmp2, tmp3);
  13249. const __m256 tmp7 = _mm256_unpacklo_ps(tmp8, tmp9);
  13250. const __m256 tmp10 = _mm256_unpacklo_ps(tmp4, tmp5);
  13251. const __m256 tmp11 = _mm256_unpackhi_ps(tmp8, tmp9);
  13252. v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
  13253. v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
  13254. v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp6, tmp7));
  13255. v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp6, tmp7));
  13256. v4.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp10, tmp11));
  13257. v5.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp10, tmp11));
  13258. }
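// Loading six successive members as six contiguous 256-bit vectors and
// transposing them with shuffle128/unpack avoids 48 scalar loads; the
// index-based 6-member overload above instead falls back to a 4-member plus a
// 2-member deinterleave.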
  13259. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  13260. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
  13261. {
  13262. deinterleave(data, i, v0, v1, v2, v3);
  13263. deinterleave(data + 4, i, v4, v5, v6);
  13264. }
  13265. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  13266. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
  13267. {
  13268. deinterleave(data, i, v0, v1, v2, v3);
  13269. deinterleave(data + 4, i, v4, v5, v6, v7);
  13270. }
  13271. };
  13272. template<typename V> struct InterleaveImpl<V, 4, 32> {
  13273. template <typename I>
  13274. static inline void interleave(typename V::EntryType *const data, const I &i,
  13275. const typename V::AsArg v0, const typename V::AsArg v1)
  13276. {
  13277. using namespace AVX;
  13278. const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
  13279. const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
  13280. _mm_storeu_pd(&data[i[0]], lo128(tmp0));
  13281. _mm_storeu_pd(&data[i[1]], lo128(tmp1));
  13282. _mm_storeu_pd(&data[i[2]], hi128(tmp0));
  13283. _mm_storeu_pd(&data[i[3]], hi128(tmp1));
  13284. }
  13285. template <typename I>
  13286. static inline void interleave(typename V::EntryType *const data, const I &i,
  13287. const typename V::AsArg v0, const typename V::AsArg v1,
  13288. const typename V::AsArg v2)
  13289. {
  13290. using namespace AVX;
  13291. #ifdef Vc_USE_MASKMOV_SCATTER
  13292. const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
  13293. const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
  13294. const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data());
  13295. const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data());
  13296. #if defined(Vc_MSVC) && (Vc_MSVC < 170000000 || !defined(_WIN64))
  13297. const m256i mask = concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1));
  13298. #else
  13299. const m256i mask = _mm256_set_epi64x(0, -1, -1, -1);
  13300. #endif
  13301. _mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128<X0, Y0>(tmp0, tmp2));
  13302. _mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128<X0, Y0>(tmp1, tmp3));
  13303. _mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128<X1, Y1>(tmp0, tmp2));
  13304. _mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128<X1, Y1>(tmp1, tmp3));
  13305. #else
  13306. interleave(data, i, v0, v1);
  13307. v2.scatter(data + 2, i);
  13308. #endif
  13309. }
  13310. template <typename I>
  13311. static inline void interleave(typename V::EntryType *const data, const I &i,
  13312. const typename V::AsArg v0, const typename V::AsArg v1,
  13313. const typename V::AsArg v2, const typename V::AsArg v3)
  13314. {
  13315. using namespace AVX;
  13316. const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
  13317. const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
  13318. const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data());
  13319. const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data());
  13320. _mm_storeu_pd(&data[i[0] ], lo128(tmp0));
  13321. _mm_storeu_pd(&data[i[0]+2], lo128(tmp2));
  13322. _mm_storeu_pd(&data[i[1] ], lo128(tmp1));
  13323. _mm_storeu_pd(&data[i[1]+2], lo128(tmp3));
  13324. _mm_storeu_pd(&data[i[2] ], hi128(tmp0));
  13325. _mm_storeu_pd(&data[i[2]+2], hi128(tmp2));
  13326. _mm_storeu_pd(&data[i[3] ], hi128(tmp1));
  13327. _mm_storeu_pd(&data[i[3]+2], hi128(tmp3));
  13328. }
  13329. template <typename I>
  13330. static inline void interleave(typename V::EntryType *const data, const I &i,
  13331. const typename V::AsArg v0, const typename V::AsArg v1,
  13332. const typename V::AsArg v2, const typename V::AsArg v3,
  13333. const typename V::AsArg v4)
  13334. {
  13335. interleave(data, i, v0, v1, v2, v3);
  13336. v4.scatter(data + 4, i);
  13337. }
  13338. template <typename I>
  13339. static inline void interleave(typename V::EntryType *const data, const I &i,
  13340. const typename V::AsArg v0, const typename V::AsArg v1,
  13341. const typename V::AsArg v2, const typename V::AsArg v3,
  13342. const typename V::AsArg v4, const typename V::AsArg v5)
  13343. {
  13344. interleave(data, i, v0, v1, v2, v3);
  13345. interleave(data + 4, i, v4, v5);
  13346. }
  13347. template <typename I>
  13348. static inline void interleave(typename V::EntryType *const data, const I &i,
  13349. const typename V::AsArg v0, const typename V::AsArg v1,
  13350. const typename V::AsArg v2, const typename V::AsArg v3,
  13351. const typename V::AsArg v4, const typename V::AsArg v5,
  13352. const typename V::AsArg v6)
  13353. {
  13354. interleave(data, i, v0, v1, v2, v3);
  13355. interleave(data + 4, i, v4, v5, v6);
  13356. }
  13357. template <typename I>
  13358. static inline void interleave(typename V::EntryType *const data, const I &i,
  13359. const typename V::AsArg v0, const typename V::AsArg v1,
  13360. const typename V::AsArg v2, const typename V::AsArg v3,
  13361. const typename V::AsArg v4, const typename V::AsArg v5,
  13362. const typename V::AsArg v6, const typename V::AsArg v7)
  13363. {
  13364. interleave(data, i, v0, v1, v2, v3);
  13365. interleave(data + 4, i, v4, v5, v6, v7);
  13366. }
  13367. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  13368. const I &i, V &v0, V &v1)
  13369. {
  13370. using namespace Vc::AVX;
  13371. const m256d ab02 = concat(_mm_loadu_pd(&data[i[0]]), _mm_loadu_pd(&data[i[2]]));
  13372. const m256d ab13 = concat(_mm_loadu_pd(&data[i[1]]), _mm_loadu_pd(&data[i[3]]));
  13373. v0.data() = _mm256_unpacklo_pd(ab02, ab13);
  13374. v1.data() = _mm256_unpackhi_pd(ab02, ab13);
  13375. }
  13376. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  13377. const I &i, V &v0, V &v1, V &v2)
  13378. {
  13379. v2.gather(data + 2, i);
  13380. deinterleave(data, i, v0, v1);
  13381. }
  13382. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  13383. const I &i, V &v0, V &v1, V &v2, V &v3)
  13384. {
  13385. deinterleave(data, i, v0, v1);
  13386. deinterleave(data + 2, i, v2, v3);
  13387. }
  13388. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  13389. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
  13390. {
  13391. v4.gather(data + 4, i);
  13392. deinterleave(data, i, v0, v1);
  13393. deinterleave(data + 2, i, v2, v3);
  13394. }
  13395. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  13396. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
  13397. {
  13398. deinterleave(data, i, v0, v1);
  13399. deinterleave(data + 2, i, v2, v3);
  13400. deinterleave(data + 4, i, v4, v5);
  13401. }
  13402. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  13403. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
  13404. {
  13405. v6.gather(data + 6, i);
  13406. deinterleave(data, i, v0, v1);
  13407. deinterleave(data + 2, i, v2, v3);
  13408. deinterleave(data + 4, i, v4, v5);
  13409. }
  13410. template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
  13411. const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
  13412. {
  13413. deinterleave(data, i, v0, v1);
  13414. deinterleave(data + 2, i, v2, v3);
  13415. deinterleave(data + 4, i, v4, v5);
  13416. deinterleave(data + 6, i, v6, v7);
  13417. }
  13418. };
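// Usage sketch (illustration only, not part of the implementation): these
// InterleaveImpl specializations back Vc::InterleavedMemoryWrapper, the
// intended public entry point for AoS <-> SoA access. Assuming the Vc 1.x
// wrapper/tie API and a hypothetical user-defined struct:
//
//   struct Particle { float x, y, z; };
//   Particle *mem = /* hypothetical buffer */;
//   Vc::InterleavedMemoryWrapper<Particle, Vc::float_v> wrapper(mem);
//   Vc::float_v x, y, z;
//   Vc::tie(x, y, z) = wrapper[idx];   // deinterleaving gather (idx: index vector)
//   wrapper[idx] = Vc::tie(x, y, z);   // interleaving scatter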
  13419. }
  13420. }
  13421. #endif
  13422. namespace Vc_VERSIONED_NAMESPACE
  13423. {
  13424. template <typename T> class Mask<T, VectorAbi::Avx>
  13425. {
  13426. public:
  13427. using abi = VectorAbi::Avx;
  13428. typedef bool EntryType;
  13429. using value_type = EntryType;
  13430. using MaskBool = Common::MaskBool<sizeof(T)>;
  13431. using VectorEntryType = MaskBool;
  13432. using Vector = AVX2::Vector<T>;
  13433. using VectorTypeF = AVX::FloatVectorType<typename AVX::VectorTypeHelper<T>::Type>;
  13434. using VectorTypeD = AVX::DoubleVectorType<VectorTypeF>;
  13435. using VectorTypeI = AVX::IntegerVectorType<VectorTypeF>;
  13436. private:
  13437. typedef const VectorTypeF VArg;
  13438. typedef const VectorTypeD VdArg;
  13439. typedef const VectorTypeI ViArg;
  13440. public:
  13441. static constexpr size_t Size = sizeof(VectorTypeF) / sizeof(T);
  13442. static constexpr size_t MemoryAlignment = Size;
  13443. static constexpr std::size_t size() { return Size; }
  13444. Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType));
  13445. private:
  13446. typedef Common::Storage<T, Size> Storage;
  13447. public:
  13448. using VectorType = typename Storage::VectorType;
  13449. using EntryReference = Vc::Detail::ElementReference<Mask>;
  13450. using reference = EntryReference;
  13451. #if defined Vc_MSVC && defined _WIN32
  13452. typedef const Mask &AsArg;
  13453. #else
  13454. typedef const Mask AsArg;
  13455. #endif
  13456. Vc_INTRINSIC Mask() {}
  13457. Vc_INTRINSIC Mask(VArg x) : d(AVX::avx_cast<VectorType>(x)) {}
  13458. Vc_INTRINSIC Mask(VdArg x) : d(AVX::avx_cast<VectorType>(x)) {}
  13459. Vc_INTRINSIC Mask(ViArg x) : d(AVX::avx_cast<VectorType>(x)) {}
  13460. Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : d(Detail::zero<VectorType>()) {}
  13461. Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : d(Detail::allone<VectorType>()) {}
  13462. Vc_INTRINSIC explicit Mask(bool b)
  13463. : d(b ? Detail::allone<VectorType>() : Detail::zero<VectorType>())
  13464. {
  13465. }
  13466. Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; }
  13467. Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; }
  13468. template <typename U>
  13469. Vc_INTRINSIC Mask(
  13470. U &&rhs, Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg)
  13471. : d(AVX::avx_cast<VectorType>(
  13472. Detail::mask_cast<Traits::decay<U>::Size, Size, VectorTypeF>(
  13473. rhs.dataI())))
  13474. {
  13475. }
  13476. #if Vc_IS_VERSION_1
  13477. template <typename U>
  13478. Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
  13479. "mask types") Vc_INTRINSIC
  13480. explicit Mask(U &&rhs,
  13481. Common::enable_if_mask_converts_explicitly<T, U> = nullarg);
  13482. #endif
  13483. template<typename Flags = DefaultLoadTag> Vc_INTRINSIC explicit Mask(const bool *mem, Flags f = Flags()) { load(mem, f); }
  13484. template<typename Flags = DefaultLoadTag> Vc_INTRINSIC void load(const bool *mem, Flags = Flags());
  13485. template<typename Flags = DefaultLoadTag> Vc_INTRINSIC void store(bool *mem, Flags = Flags()) const;
  13486. Vc_INTRINSIC Mask &operator=(const Mask &) = default;
  13487. Vc_INTRINSIC_L Mask &operator=(const std::array<bool, Size> &values) Vc_INTRINSIC_R;
  13488. Vc_INTRINSIC_L operator std::array<bool, Size>() const Vc_INTRINSIC_R;
  13489. Vc_INTRINSIC Vc_PURE bool operator==(const Mask &rhs) const
  13490. { return Detail::movemask(d.v()) == Detail::movemask(rhs.d.v()); }
  13491. Vc_INTRINSIC Vc_PURE bool operator!=(const Mask &rhs) const
  13492. { return !operator==(rhs); }
  13493. Vc_INTRINSIC Mask operator!() const
  13494. {
  13495. #ifdef Vc_GCC
  13496. return ~dataI();
  13497. #else
  13498. return Detail::andnot_(dataF(), Detail::allone<VectorTypeF>());
  13499. #endif
  13500. }
  13501. Vc_INTRINSIC Mask &operator&=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::and_(data(), rhs.data())); return *this; }
  13502. Vc_INTRINSIC Mask &operator|=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::or_ (data(), rhs.data())); return *this; }
  13503. Vc_INTRINSIC Mask &operator^=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::xor_(data(), rhs.data())); return *this; }
  13504. Vc_INTRINSIC Vc_PURE Mask operator&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); }
  13505. Vc_INTRINSIC Vc_PURE Mask operator|(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); }
  13506. Vc_INTRINSIC Vc_PURE Mask operator^(const Mask &rhs) const { return Detail::xor_(data(), rhs.data()); }
  13507. Vc_INTRINSIC Vc_PURE Mask operator&&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); }
  13508. Vc_INTRINSIC Vc_PURE Mask operator||(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); }
  13509. Vc_INTRINSIC_L bool isNotEmpty() const Vc_INTRINSIC_R;
  13510. Vc_INTRINSIC_L bool isEmpty() const Vc_INTRINSIC_R;
  13511. Vc_INTRINSIC_L bool isFull() const Vc_INTRINSIC_R;
  13512. Vc_INTRINSIC_L bool isMix() const Vc_INTRINSIC_R;
  13513. Vc_INTRINSIC Vc_PURE int shiftMask() const { return Detail::movemask(dataI()); }
  13514. Vc_INTRINSIC Vc_PURE int toInt() const { return Detail::mask_to_int<Size>(dataI()); }
  13515. Vc_INTRINSIC VectorType data () const { return d.v(); }
  13516. Vc_INTRINSIC VectorTypeF dataF() const { return AVX::avx_cast<VectorTypeF>(d.v()); }
  13517. Vc_INTRINSIC VectorTypeI dataI() const { return AVX::avx_cast<VectorTypeI>(d.v()); }
  13518. Vc_INTRINSIC VectorTypeD dataD() const { return AVX::avx_cast<VectorTypeD>(d.v()); }
  13519. private:
  13520. friend reference;
  13521. static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept
  13522. {
  13523. return m.toInt() & (1 << i);
  13524. }
  13525. template <typename U>
  13526. static Vc_INTRINSIC void set(Mask &m, int i,
  13527. U &&v) noexcept(noexcept(MaskBool(std::declval<U>())))
  13528. {
  13529. m.d.set(i, MaskBool(std::forward<U>(v)));
  13530. }
  13531. public:
  13532. Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
  13533. {
  13534. return {*this, int(index)};
  13535. }
  13536. Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept
  13537. {
  13538. return get(*this, index);
  13539. }
  13540. Vc_INTRINSIC Vc_PURE int count() const { return Detail::popcnt16(toInt()); }
  13541. Vc_INTRINSIC Vc_PURE int firstOne() const { return _bit_scan_forward(toInt()); }
  13542. template <typename G> static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R;
  13543. Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R;
  13544. private:
  13545. #ifdef Vc_COMPILE_BENCHMARKS
  13546. public:
  13547. #endif
  13548. Storage d;
  13549. };
  13550. template <typename T> constexpr size_t Mask<T, VectorAbi::Avx>::Size;
  13551. template <typename T> constexpr size_t Mask<T, VectorAbi::Avx>::MemoryAlignment;
  13552. }
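// Usage sketch (illustration only, assuming the public Vc 1.x API): masks of
// this type are normally produced by comparisons and queried via the members
// defined above, e.g.
//
//   Vc::float_v v = /* hypothetical data */;
//   Vc::float_m m = v > 0.f;             // 8-wide AVX mask
//   if (m.isFull())  { /* every lane passed */ }
//   if (m.isEmpty()) { /* no lane passed */ }
//   int  n     = m.count();              // number of set lanes
//   bool first = m[0];                   // per-lane access via operator[]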
  13553. namespace Vc_VERSIONED_NAMESPACE
  13554. {
  13555. template <typename T>
  13556. template <typename Flags>
  13557. Vc_INTRINSIC void Mask<T, VectorAbi::Avx>::store(bool *mem, Flags f) const
  13558. {
  13559. Detail::mask_store<Size>(dataI(), mem, f);
  13560. }
  13561. template <typename T>
  13562. template <typename Flags>
  13563. Vc_INTRINSIC void Mask<T, VectorAbi::Avx>::load(const bool *mem, Flags f)
  13564. {
  13565. d.v() = AVX::avx_cast<VectorType>(Detail::mask_load<VectorTypeF, Size>(mem, f));
  13566. }
  13567. #ifdef Vc_IMPL_AVX2
  13568. template <>
  13569. Vc_INTRINSIC Vc_PURE bool AVX2::Mask<int16_t>::get(const AVX2::Mask<int16_t> &m,
  13570. int index) noexcept
  13571. {
  13572. return m.shiftMask() & (1 << 2 * index);
  13573. }
  13574. template <>
  13575. Vc_INTRINSIC Vc_PURE bool AVX2::Mask<uint16_t>::get(const AVX2::Mask<uint16_t> &m,
  13576. int index) noexcept
  13577. {
  13578. return m.shiftMask() & (1 << 2 * index);
  13579. }
  13580. #endif
  13581. template <> Vc_INTRINSIC Vc_PURE bool AVX2::double_m::operator==(const AVX2::double_m &rhs) const
  13582. { return Detail::movemask(dataD()) == Detail::movemask(rhs.dataD()); }
  13583. #ifdef Vc_IMPL_AVX2
  13584. template <> Vc_INTRINSIC Vc_PURE bool AVX2::short_m::operator==(const AVX2::short_m &rhs) const
  13585. { return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); }
  13586. template <> Vc_INTRINSIC Vc_PURE bool AVX2::ushort_m::operator==(const AVX2::ushort_m &rhs) const
  13587. { return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); }
  13588. #endif
  13589. template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isFull() const {
  13590. if (sizeof(T) == 8) {
  13591. return 0 != Detail::testc(dataD(), Detail::allone<VectorTypeD>());
  13592. } else if (sizeof(T) == 4) {
  13593. return 0 != Detail::testc(dataF(), Detail::allone<VectorTypeF>());
  13594. } else {
  13595. return 0 != Detail::testc(dataI(), Detail::allone<VectorTypeI>());
  13596. }
  13597. }
  13598. template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isNotEmpty() const {
  13599. if (sizeof(T) == 8) {
  13600. return 0 == Detail::testz(dataD(), dataD());
  13601. } else if (sizeof(T) == 4) {
  13602. return 0 == Detail::testz(dataF(), dataF());
  13603. } else {
  13604. return 0 == Detail::testz(dataI(), dataI());
  13605. }
  13606. }
  13607. template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isEmpty() const {
  13608. if (sizeof(T) == 8) {
  13609. return 0 != Detail::testz(dataD(), dataD());
  13610. } else if (sizeof(T) == 4) {
  13611. return 0 != Detail::testz(dataF(), dataF());
  13612. } else {
  13613. return 0 != Detail::testz(dataI(), dataI());
  13614. }
  13615. }
  13616. template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isMix() const {
  13617. if (sizeof(T) == 8) {
  13618. return 0 != Detail::testnzc(dataD(), Detail::allone<VectorTypeD>());
  13619. } else if (sizeof(T) == 4) {
  13620. return 0 != Detail::testnzc(dataF(), Detail::allone<VectorTypeF>());
  13621. } else {
  13622. return 0 != Detail::testnzc(dataI(), Detail::allone<VectorTypeI>());
  13623. }
  13624. }
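// The predicates above map directly onto the AVX vtest semantics (sign bits
// for the float/double variants, all bits for the integer variant):
//   testc(x, allone) != 0  <=>  every tested bit of x is set   -> isFull
//   testz(x, x)      == 0  <=>  some tested bit of x is set    -> isNotEmpty
//   testz(x, x)      != 0  <=>  no tested bit of x is set      -> isEmpty
//   testnzc(...)     != 0  <=>  some but not all bits are set  -> isMix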
  13625. template <typename M, typename G>
  13626. Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4 + 32>)
  13627. {
  13628. return _mm256_setr_epi64x(
  13629. gen(0) ? 0xffffffffffffffffull : 0, gen(1) ? 0xffffffffffffffffull : 0,
  13630. gen(2) ? 0xffffffffffffffffull : 0, gen(3) ? 0xffffffffffffffffull : 0);
  13631. }
  13632. template <typename M, typename G>
  13633. Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8 + 32>)
  13634. {
  13635. return _mm256_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
  13636. gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0,
  13637. gen(4) ? 0xfffffffful : 0, gen(5) ? 0xfffffffful : 0,
  13638. gen(6) ? 0xfffffffful : 0, gen(7) ? 0xfffffffful : 0);
  13639. }
  13640. template <typename M, typename G>
  13641. Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 16 + 32>)
  13642. {
  13643. return _mm256_setr_epi16(gen(0) ? 0xfffful : 0, gen(1) ? 0xfffful : 0,
  13644. gen(2) ? 0xfffful : 0, gen(3) ? 0xfffful : 0,
  13645. gen(4) ? 0xfffful : 0, gen(5) ? 0xfffful : 0,
  13646. gen(6) ? 0xfffful : 0, gen(7) ? 0xfffful : 0,
  13647. gen(8) ? 0xfffful : 0, gen(9) ? 0xfffful : 0,
  13648. gen(10) ? 0xfffful : 0, gen(11) ? 0xfffful : 0,
  13649. gen(12) ? 0xfffful : 0, gen(13) ? 0xfffful : 0,
  13650. gen(14) ? 0xfffful : 0, gen(15) ? 0xfffful : 0);
  13651. }
  13652. template <typename T>
  13653. template <typename G>
  13654. Vc_INTRINSIC AVX2::Mask<T> Mask<T, VectorAbi::Avx>::generate(G &&gen)
  13655. {
  13656. return generate_impl<AVX2::Mask<T>>(std::forward<G>(gen),
  13657. std::integral_constant<int, Size + sizeof(Storage)>());
  13658. }
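// generate() dispatches on Size + sizeof(Storage): with a 32-byte AVX register
// this yields the tags 4+32, 8+32 and 16+32 handled by the generate_impl
// overloads above (64-, 32- and 16-bit mask entries respectively).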
  13659. template <typename T> Vc_INTRINSIC Vc_PURE AVX2::Mask<T> Mask<T, VectorAbi::Avx>::shifted(int amount) const
  13660. {
  13661. switch (amount * int(sizeof(VectorEntryType))) {
  13662. case 0: return *this;
  13663. case 1: return Detail::shifted< 1>(dataI());
  13664. case 2: return Detail::shifted< 2>(dataI());
  13665. case 3: return Detail::shifted< 3>(dataI());
  13666. case 4: return Detail::shifted< 4>(dataI());
  13667. case 5: return Detail::shifted< 5>(dataI());
  13668. case 6: return Detail::shifted< 6>(dataI());
  13669. case 7: return Detail::shifted< 7>(dataI());
  13670. case 8: return Detail::shifted< 8>(dataI());
  13671. case 9: return Detail::shifted< 9>(dataI());
  13672. case 10: return Detail::shifted< 10>(dataI());
  13673. case 11: return Detail::shifted< 11>(dataI());
  13674. case 12: return Detail::shifted< 12>(dataI());
  13675. case 13: return Detail::shifted< 13>(dataI());
  13676. case 14: return Detail::shifted< 14>(dataI());
  13677. case 15: return Detail::shifted< 15>(dataI());
  13678. case 16: return Detail::shifted< 16>(dataI());
  13679. case 17: return Detail::shifted< 17>(dataI());
  13680. case 18: return Detail::shifted< 18>(dataI());
  13681. case 19: return Detail::shifted< 19>(dataI());
  13682. case 20: return Detail::shifted< 20>(dataI());
  13683. case 21: return Detail::shifted< 21>(dataI());
  13684. case 22: return Detail::shifted< 22>(dataI());
  13685. case 23: return Detail::shifted< 23>(dataI());
  13686. case 24: return Detail::shifted< 24>(dataI());
  13687. case 25: return Detail::shifted< 25>(dataI());
  13688. case 26: return Detail::shifted< 26>(dataI());
  13689. case 27: return Detail::shifted< 27>(dataI());
  13690. case 28: return Detail::shifted< 28>(dataI());
  13691. case 29: return Detail::shifted< 29>(dataI());
  13692. case 30: return Detail::shifted< 30>(dataI());
  13693. case 31: return Detail::shifted< 31>(dataI());
  13694. case -1: return Detail::shifted< -1>(dataI());
  13695. case -2: return Detail::shifted< -2>(dataI());
  13696. case -3: return Detail::shifted< -3>(dataI());
  13697. case -4: return Detail::shifted< -4>(dataI());
  13698. case -5: return Detail::shifted< -5>(dataI());
  13699. case -6: return Detail::shifted< -6>(dataI());
  13700. case -7: return Detail::shifted< -7>(dataI());
  13701. case -8: return Detail::shifted< -8>(dataI());
  13702. case -9: return Detail::shifted< -9>(dataI());
  13703. case -10: return Detail::shifted<-10>(dataI());
  13704. case -11: return Detail::shifted<-11>(dataI());
  13705. case -12: return Detail::shifted<-12>(dataI());
  13706. case -13: return Detail::shifted<-13>(dataI());
  13707. case -14: return Detail::shifted<-14>(dataI());
  13708. case -15: return Detail::shifted<-15>(dataI());
  13709. case -16: return Detail::shifted<-16>(dataI());
  13710. case -17: return Detail::shifted<-17>(dataI());
  13711. case -18: return Detail::shifted<-18>(dataI());
  13712. case -19: return Detail::shifted<-19>(dataI());
  13713. case -20: return Detail::shifted<-20>(dataI());
  13714. case -21: return Detail::shifted<-21>(dataI());
  13715. case -22: return Detail::shifted<-22>(dataI());
  13716. case -23: return Detail::shifted<-23>(dataI());
  13717. case -24: return Detail::shifted<-24>(dataI());
  13718. case -25: return Detail::shifted<-25>(dataI());
  13719. case -26: return Detail::shifted<-26>(dataI());
  13720. case -27: return Detail::shifted<-27>(dataI());
  13721. case -28: return Detail::shifted<-28>(dataI());
  13722. case -29: return Detail::shifted<-29>(dataI());
  13723. case -30: return Detail::shifted<-30>(dataI());
  13724. case -31: return Detail::shifted<-31>(dataI());
  13725. }
  13726. return Zero();
  13727. }
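// shifted() converts the entry count into a byte shift (amount *
// sizeof(VectorEntryType)) and dispatches to the compile-time byte shift in
// Detail::shifted; any amount outside [-31, 31] bytes falls through the switch
// and returns an empty mask.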
  13728. }
  13729. #endif
  13730. #include <algorithm>
  13731. #include <cmath>
  13732. #ifdef isfinite
  13733. #undef isfinite
  13734. #endif
  13735. #ifdef isnan
  13736. #undef isnan
  13737. #endif
  13738. namespace Vc_VERSIONED_NAMESPACE
  13739. {
  13740. namespace Detail
  13741. {
  13742. template <typename T, typename Abi> struct VectorTraits
  13743. {
  13744. using mask_type = Vc::Mask<T, Abi>;
  13745. using vector_type = Vc::Vector<T, Abi>;
  13746. using writemasked_vector_type = Common::WriteMaskedVector<vector_type, mask_type>;
  13747. using intrinsic_type = typename AVX::VectorTypeHelper<T>::Type;
  13748. };
  13749. }
  13750. #define Vc_CURRENT_CLASS_NAME Vector
  13751. template <typename T> class Vector<T, VectorAbi::Avx>
  13752. {
  13753. public:
  13754. using abi = VectorAbi::Avx;
  13755. private:
  13756. using traits_type = Detail::VectorTraits<T, abi>;
  13757. static_assert(
  13758. std::is_arithmetic<T>::value,
  13759. "Vector<T> only accepts arithmetic builtin types as template parameter T.");
  13760. using WriteMaskedVector = typename traits_type::writemasked_vector_type;
  13761. public:
  13762. using VectorType = typename traits_type::intrinsic_type;
  13763. using vector_type = VectorType;
  13764. using mask_type = typename traits_type::mask_type;
  13765. using Mask = mask_type;
  13766. using MaskType = mask_type;
  13767. using MaskArg Vc_DEPRECATED_ALIAS("Use MaskArgument instead.") = typename Mask::AsArg;
  13768. using MaskArgument = typename Mask::AsArg;
  13769. using reference = Detail::ElementReference<Vector>;
  13770. Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType));
  13771. using EntryType = T;
  13772. using value_type = EntryType;
  13773. typedef EntryType VectorEntryType;
  13774. static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
  13775. static constexpr size_t MemoryAlignment = alignof(VectorType);
  13776. using IndexType = fixed_size_simd<int, Size>;
  13777. typedef Vector<T, abi> AsArg;
  13778. typedef VectorType VectorTypeArg;
  13779. protected:
  13780. template <typename U> using V = Vector<U, abi>;
  13781. typedef AVX::VectorHelper<VectorType> HV;
  13782. typedef AVX::VectorHelper<T> HT;
  13783. template <typename V> static Vc_INTRINSIC VectorType _cast(V v)
  13784. {
  13785. return AVX::avx_cast<VectorType>(v);
  13786. }
  13787. typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
  13788. StorageType d;
  13789. using WidthT = Common::WidthT<VectorType>;
13790. public:
  13792. Vc_INTRINSIC Vector() = default;
  13793. static constexpr std::size_t size() { return Size; }
  13794. explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R;
  13795. explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R;
  13796. explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R;
  13797. static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); }
  13798. static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); }
  13799. static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero()
  13800. {
  13801. return Vector(Vc::IndexesFromZero);
  13802. }
  13803. template <class G, int = 0,
  13804. class = typename std::enable_if<std::is_convertible<
  13805. decltype(std::declval<G>()(size_t())), value_type>::value>::type>
  13806. explicit Vector(G &&g) : Vector(generate(std::forward<G>(g)))
  13807. {
  13808. }
  13809. static Vc_ALWAYS_INLINE_L Vector Random() Vc_ALWAYS_INLINE_R;
  13810. Vc_ALWAYS_INLINE Vector(VectorTypeArg x) : d(x) {}
  13811. template <typename U>
  13812. Vc_INTRINSIC Vector(
  13813. V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
  13814. void *>::type = nullptr)
  13815. : d(AVX::convert<U, T>(x.data()))
  13816. {
  13817. }
  13818. #if Vc_IS_VERSION_1
  13819. template <typename U>
  13820. Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
  13821. "vector types") Vc_INTRINSIC explicit Vector(
  13822. V<U> x,
  13823. typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
  13824. void *>::type = nullptr)
  13825. : d(Detail::zeroExtendIfNeeded(AVX::convert<U, T>(x.data())))
  13826. {
  13827. }
  13828. template <typename U,
  13829. typename = enable_if<Traits::is_simd_vector<U>::value &&
  13830. !std::is_same<Vector, Traits::decay<U>>::value>>
  13831. Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
  13832. "vector types") Vc_INTRINSIC_L
  13833. explicit Vector(U &&x) Vc_INTRINSIC_R;
  13834. #endif
  13835. Vc_INTRINSIC Vector(EntryType a) : d(Detail::avx_broadcast(a)) {}
  13836. template <typename U>
  13837. Vc_INTRINSIC Vector(U a,
  13838. typename std::enable_if<std::is_same<U, int>::value &&
  13839. !std::is_same<U, EntryType>::value,
  13840. void *>::type = nullptr)
  13841. : Vector(static_cast<EntryType>(a))
  13842. {
  13843. }
  13844. explicit Vector(std::initializer_list<EntryType>)
  13845. {
  13846. static_assert(std::is_same<EntryType, void>::value,
  13847. "A SIMD vector object cannot be initialized from an initializer list "
  13848. "because the number of entries in the vector is target-dependent.");
  13849. }
  13850. explicit Vc_INTRINSIC Vector(const EntryType *mem)
  13851. {
  13852. load(mem);
  13853. }
  13854. template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
  13855. explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
  13856. {
  13857. load(mem, flags);
  13858. }
  13859. template <typename U, typename Flags = DefaultLoadTag,
  13860. typename = enable_if<
  13861. (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
  13862. sizeof(EntryType) >= sizeof(U)) &&
  13863. std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
  13864. explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
  13865. {
  13866. load<U, Flags>(x, flags);
  13867. }
  13868. Vc_INTRINSIC void load(const EntryType *mem)
  13869. {
  13870. load(mem, DefaultLoadTag());
  13871. }
  13872. template <typename Flags>
  13873. Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
  13874. load(const EntryType *mem, Flags flags)
  13875. {
  13876. load<EntryType, Flags>(mem, flags);
  13877. }
  13878. private:
  13879. template <typename U, typename Flags>
  13880. struct load_concept : public std::enable_if<
  13881. (!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
  13882. sizeof(EntryType) >= sizeof(U)) &&
  13883. std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
  13884. {};
  13885. public:
  13886. template <typename U, typename Flags = DefaultLoadTag>
  13887. Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
  13888. template <
  13889. typename U,
  13890. typename Flags = DefaultStoreTag,
  13891. typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
  13892. Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
  13893. template <
  13894. typename U,
  13895. typename Flags = DefaultStoreTag,
  13896. typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
  13897. Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
  13898. Vc_INTRINSIC void store(EntryType *mem) const
  13899. {
  13900. store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
  13901. }
  13902. template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
  13903. Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
  13904. {
  13905. store<EntryType, Flags>(mem, flags);
  13906. }
  13907. Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
  13908. {
  13909. store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
  13910. }
  13911. template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
  13912. Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
  13913. {
  13914. store<EntryType, Flags>(mem, mask, flags);
  13915. }
  13916. Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R;
  13917. Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R;
  13918. Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R;
  13919. Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
  13920. Vc_INTRINSIC_L void setQnan(MaskArgument k) Vc_INTRINSIC_R;
  13921. #ifndef Vc_CURRENT_CLASS_NAME
  13922. #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
  13923. #endif
  13924. private:
  13925. template <class MT, class IT, int Scale = 1>
  13926. inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
  13927. template <class MT, class IT, int Scale = 1>
  13928. inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
  13929. MaskArgument mask);
  13930. public:
  13931. #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
  13932. static_assert( \
  13933. std::is_convertible<MT, EntryType>::value, \
  13934. "The memory pointer needs to point to a type that can be converted to the " \
  13935. "EntryType of this SIMD vector type."); \
  13936. static_assert( \
  13937. Vc::Traits::has_subscript_operator<IT>::value, \
  13938. "The indexes argument must be a type that implements the subscript operator."); \
  13939. static_assert( \
  13940. !Traits::is_simd_vector<IT>::value || \
  13941. Traits::simd_vector_size<IT>::value >= Size, \
  13942. "If you use a SIMD vector for the indexes parameter, the index vector must " \
  13943. "have at least as many entries as this SIMD vector."); \
  13944. static_assert( \
  13945. !std::is_array<T>::value || \
  13946. (std::rank<T>::value == 1 && \
  13947. (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
  13948. "If you use a simple array for the indexes parameter, the array must have " \
  13949. "at least as many entries as this SIMD vector.")
  13950. template <typename MT, typename IT,
  13951. typename = enable_if<Traits::has_subscript_operator<IT>::value>>
  13952. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
  13953. {
  13954. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  13955. gatherImplementation(
  13956. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
  13957. }
  13958. template <class MT, class IT, int Scale>
  13959. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
  13960. {
  13961. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  13962. gatherImplementation(args);
  13963. }
  13964. template <typename MT, typename IT,
  13965. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  13966. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
  13967. MaskArgument mask)
  13968. {
  13969. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  13970. gatherImplementation(
  13971. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
  13972. }
  13973. template <class MT, class IT, int Scale>
  13974. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
  13975. MaskArgument mask)
  13976. {
  13977. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  13978. gatherImplementation(args, mask);
  13979. }
  13980. template <typename MT, typename IT,
  13981. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  13982. Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
  13983. {
  13984. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  13985. gatherImplementation(
  13986. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
  13987. }
  13988. template <typename MT, typename IT,
  13989. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  13990. Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
  13991. {
  13992. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  13993. gatherImplementation(
  13994. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
  13995. }
  13996. template <class MT, class IT, int Scale>
  13997. Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
  13998. {
  13999. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  14000. gatherImplementation(args);
  14001. }
  14002. template <class MT, class IT, int Scale>
  14003. Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
  14004. MaskArgument mask)
  14005. {
  14006. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  14007. gatherImplementation(args, mask);
  14008. }
  14009. #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
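// Gather usage sketch (illustration only; buffer and index values are
// hypothetical). The gathering constructors and gather() overloads declared
// above allow, for example:
//
//   float mem[1024] = { /* ... */ };
//   Vc::float_v::IndexType idx = Vc::float_v::IndexType::IndexesFromZero() * 4;
//   Vc::float_v v(mem, idx);            // gathering constructor
//   v.gather(mem, idx, v > 0.f);        // masked gather: only selected lanes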
  14010. private:
  14011. template <typename MT, typename IT>
  14012. inline void scatterImplementation(MT *mem, IT &&indexes) const;
  14013. template <typename MT, typename IT>
  14014. inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
  14015. public:
  14016. #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
  14017. static_assert( \
  14018. std::is_convertible<EntryType, MT>::value, \
  14019. "The memory pointer needs to point to a type that the EntryType of this " \
  14020. "SIMD vector type can be converted to."); \
  14021. static_assert( \
  14022. Vc::Traits::has_subscript_operator<IT>::value, \
  14023. "The indexes argument must be a type that implements the subscript operator."); \
  14024. static_assert( \
  14025. !Traits::is_simd_vector<IT>::value || \
  14026. Traits::simd_vector_size<IT>::value >= Size, \
  14027. "If you use a SIMD vector for the indexes parameter, the index vector must " \
  14028. "have at least as many entries as this SIMD vector."); \
  14029. static_assert( \
  14030. !std::is_array<T>::value || \
  14031. (std::rank<T>::value == 1 && \
  14032. (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
  14033. "If you use a simple array for the indexes parameter, the array must have " \
  14034. "at least as many entries as this SIMD vector.")
  14035. template <typename MT,
  14036. typename IT,
  14037. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  14038. Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
  14039. {
  14040. Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
  14041. scatterImplementation(mem, std::forward<IT>(indexes));
  14042. }
  14043. template <typename MT,
  14044. typename IT,
  14045. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  14046. Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
  14047. {
  14048. Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
  14049. scatterImplementation(mem, std::forward<IT>(indexes), mask);
  14050. }
  14051. template <typename MT, typename IT>
  14052. Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
  14053. {
  14054. scatter(args.address, args.indexes);
  14055. }
  14056. template <typename MT, typename IT>
  14057. Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
  14058. {
  14059. scatter(args.address, args.indexes, mask);
  14060. }
  14061. #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
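// Scatter usage sketch (illustration only, mirroring the gather example
// above):
//
//   v.scatter(mem, idx);                // store v[k] to mem[idx[k]]
//   v.scatter(mem, idx, v > 0.f);       // only lanes selected by the mask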
  14062. #if defined Vc_IMPL_AVX2 && !defined Vc_MSVC
  14063. template <class U, class A, int Scale, int N = Vector<U, A>::size(),
  14064. class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
  14065. Vc_INTRINSIC void gatherImplementation(
  14066. const Common::GatherArguments<T, Vector<U, A>, Scale> &args)
  14067. {
  14068. d.v() = AVX::gather<sizeof(T) * Scale>(
  14069. args.address,
  14070. simd_cast<conditional_t<Size == 4, SSE::int_v, AVX2::int_v>>(args.indexes)
  14071. .data());
  14072. }
  14073. template <class U, class A, int Scale, int N = Vector<U, A>::size(),
  14074. class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
  14075. Vc_INTRINSIC void gatherImplementation(
  14076. const Common::GatherArguments<T, Vector<U, A>, Scale> &args, MaskArgument k)
  14077. {
  14078. d.v() = AVX::gather<sizeof(T) * Scale>(
  14079. d.v(), k.data(), args.address,
  14080. simd_cast<conditional_t<Size == 4, SSE::int_v, AVX2::int_v>>(args.indexes)
  14081. .data());
  14082. }
  14083. template <
  14084. class MT, class U, class A, int Scale,
  14085. class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
  14086. (sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
  14087. Vc_INTRINSIC void gatherImplementation(
  14088. const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
  14089. {
  14090. using AVX2::int_v;
  14091. const auto idx0 = simd_cast<int_v, 0>(args.indexes).data();
  14092. const auto idx1 = simd_cast<int_v, 1>(args.indexes).data();
  14093. *this = simd_cast<Vector>(int_v(AVX::gather<sizeof(MT) * Scale>(
  14094. aliasing_cast<int>(args.address), idx0)),
  14095. int_v(AVX::gather<sizeof(MT) * Scale>(
  14096. aliasing_cast<int>(args.address), idx1)));
  14097. if (sizeof(MT) == 1) {
  14098. if (std::is_signed<MT>::value) {
  14099. using Signed = AVX2::Vector<typename std::make_signed<T>::type>;
  14100. *this = (simd_cast<Signed>(*this) << 8) >> 8;
  14101. } else {
  14102. *this &= 0xff;
  14103. }
  14104. }
  14105. }
  14106. template <
  14107. class MT, class U, class A, int Scale,
  14108. class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
  14109. (sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
  14110. Vc_INTRINSIC void gatherImplementation(
  14111. const Common::GatherArguments<MT, Vector<U, A>, Scale> &args, MaskArgument k)
  14112. {
  14113. using AVX2::int_v;
  14114. const auto idx0 = simd_cast<int_v, 0>(args.indexes).data();
  14115. const auto idx1 = simd_cast<int_v, 1>(args.indexes).data();
  14116. const auto k0 = simd_cast<AVX2::int_m, 0>(k).data();
  14117. const auto k1 = simd_cast<AVX2::int_m, 1>(k).data();
  14118. auto v = simd_cast<Vector>(
  14119. int_v(AVX::gather<sizeof(MT) * Scale>(
  14120. _mm256_setzero_si256(), k0, aliasing_cast<int>(args.address), idx0)),
  14121. int_v(AVX::gather<sizeof(MT) * Scale>(
  14122. _mm256_setzero_si256(), k1, aliasing_cast<int>(args.address), idx1)));
  14123. if (sizeof(MT) == 1) {
  14124. if (std::is_signed<MT>::value) {
  14125. using Signed = AVX2::Vector<typename std::make_signed<T>::type>;
  14126. v = (simd_cast<Signed>(v) << 8) >> 8;
  14127. } else {
  14128. v &= 0xff;
  14129. }
  14130. }
  14131. assign(v, k);
  14132. }
  14133. template <class MT, class U, class A, int Scale>
  14134. Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
  14135. Traits::is_valid_vector_argument<MT>::value &&
  14136. !std::is_same<MT, T>::value &&
  14137. Vector<U, A>::size() >= size()),
  14138. void>
  14139. gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
  14140. {
  14141. *this = simd_cast<Vector>(fixed_size_simd<MT, Size>(args));
  14142. }
  14143. template <class MT, class U, class A, int Scale>
  14144. Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
  14145. Traits::is_valid_vector_argument<MT>::value &&
  14146. !std::is_same<MT, T>::value &&
  14147. Vector<U, A>::size() >= size()),
  14148. void>
  14149. gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args,
  14150. MaskArgument k)
  14151. {
  14152. assign(simd_cast<Vector>(fixed_size_simd<MT, Size>(args, k)), k);
  14153. }
  14154. #endif
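// The gatherImplementation overloads above are only compiled with AVX2 (and not on
// MSVC): they use the hardware gather via AVX::gather<Scale>. Element types of 32 bits
// and larger gather directly; 16-bit element types gather two halves as 32-bit values
// and narrow afterwards, sign- or zero-extending when the memory type is only 8 bits
// wide. Remaining combinations fall back to a fixed_size_simd gather plus simd_cast.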
  14155. Vc_ALWAYS_INLINE Vector &operator++() { data() = Detail::add(data(), Detail::one(T()), T()); return *this; }
  14156. Vc_ALWAYS_INLINE Vector &operator--() { data() = Detail::sub(data(), Detail::one(T()), T()); return *this; }
  14157. Vc_ALWAYS_INLINE Vector operator++(int) { const Vector r = *this; data() = Detail::add(data(), Detail::one(T()), T()); return r; }
  14158. Vc_ALWAYS_INLINE Vector operator--(int) { const Vector r = *this; data() = Detail::sub(data(), Detail::one(T()), T()); return r; }
  14159. private:
  14160. friend reference;
  14161. Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
  14162. {
  14163. return o.d.m(i);
  14164. }
  14165. template <typename U>
  14166. Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
  14167. noexcept(std::declval<value_type &>() = v))
  14168. {
  14169. return o.d.set(i, v);
  14170. }
  14171. public:
  14172. Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
  14173. {
  14174. static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
  14175. return {*this, int(index)};
  14176. }
  14177. Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
  14178. {
  14179. return d.m(index);
  14180. }
  14181. Vc_INTRINSIC_L Vc_PURE_L Vector operator[](Permutation::ReversedTag) const Vc_INTRINSIC_R Vc_PURE_R;
  14182. Vc_INTRINSIC_L Vc_PURE_L Vector operator[](const IndexType &perm) const Vc_INTRINSIC_R Vc_PURE_R;
  14183. Vc_INTRINSIC Vc_PURE Mask operator!() const
  14184. {
  14185. return *this == Zero();
  14186. }
  14187. Vc_ALWAYS_INLINE Vector operator~() const
  14188. {
  14189. #ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
  14190. static_assert(std::is_integral<T>::value,
  14191. "bit-complement can only be used with Vectors of integral type");
  14192. #endif
  14193. return Detail::andnot_(data(), Detail::allone<VectorType>());
  14194. }
  14195. Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
  14196. Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; }
  14197. #define Vc_OP_VEC(op) \
  14198. Vc_INTRINSIC Vector &operator op##=(AsArg x); \
  14199. Vc_INTRINSIC Vc_PURE Vector operator op(AsArg x) const \
  14200. { \
  14201. static_assert( \
  14202. std::is_integral<T>::value, \
  14203. "bitwise-operators can only be used with Vectors of integral type"); \
  14204. }
  14205. Vc_ALL_SHIFTS(Vc_OP_VEC);
  14206. #undef Vc_OP_VEC
  14207. Vc_ALWAYS_INLINE_L Vector &operator>>=(int x) Vc_ALWAYS_INLINE_R;
  14208. Vc_ALWAYS_INLINE_L Vector &operator<<=(int x) Vc_ALWAYS_INLINE_R;
  14209. Vc_ALWAYS_INLINE_L Vector operator>>(int x) const Vc_ALWAYS_INLINE_R;
  14210. Vc_ALWAYS_INLINE_L Vector operator<<(int x) const Vc_ALWAYS_INLINE_R;
  14211. Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
  14212. isNegative() const
  14213. {
  14214. return Vc::isnegative(*this);
  14215. }
  14216. Vc_ALWAYS_INLINE void assign( const Vector &v, const Mask &mask ) {
  14217. data() = Detail::blend(data(), v.data(), mask.data());
  14218. }
  14219. template <typename V2>
  14220. Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2
  14221. staticCast() const
  14222. {
  14223. return V2(*this);
  14224. }
  14225. template <typename V2>
  14226. Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE V2
  14227. reinterpretCast() const
  14228. {
  14229. return AVX::avx_cast<typename V2::VectorType>(data());
  14230. }
  14231. Vc_ALWAYS_INLINE WriteMaskedVector operator()(const Mask &k)
  14232. {
  14233. return {*this, k};
  14234. }
  14235. Vc_ALWAYS_INLINE VectorType &data() { return d.v(); }
  14236. Vc_ALWAYS_INLINE const VectorType &data() const { return d.v(); }
  14237. template<int Index>
  14238. Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R;
  14239. Vc_INTRINSIC_L std::pair<Vector, int> minIndex() const Vc_INTRINSIC_R;
  14240. Vc_INTRINSIC_L std::pair<Vector, int> maxIndex() const Vc_INTRINSIC_R;
  14241. Vc_ALWAYS_INLINE EntryType min() const { return Detail::min(data(), T()); }
  14242. Vc_ALWAYS_INLINE EntryType max() const { return Detail::max(data(), T()); }
  14243. Vc_ALWAYS_INLINE EntryType product() const { return Detail::mul(data(), T()); }
  14244. Vc_ALWAYS_INLINE EntryType sum() const { return Detail::add(data(), T()); }
  14245. Vc_ALWAYS_INLINE_L Vector partialSum() const Vc_ALWAYS_INLINE_R;
  14246. Vc_ALWAYS_INLINE_L EntryType min(MaskArgument m) const Vc_ALWAYS_INLINE_R;
  14247. Vc_ALWAYS_INLINE_L EntryType max(MaskArgument m) const Vc_ALWAYS_INLINE_R;
  14248. Vc_ALWAYS_INLINE_L EntryType product(MaskArgument m) const Vc_ALWAYS_INLINE_R;
  14249. Vc_ALWAYS_INLINE_L EntryType sum(MaskArgument m) const Vc_ALWAYS_INLINE_R;
  14250. Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R;
  14251. Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R;
  14252. Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R;
  14253. Vc_INTRINSIC_L Vc_PURE_L Vector reversed() const Vc_INTRINSIC_R Vc_PURE_R;
  14254. Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
  14255. template <typename F> void callWithValuesSorted(F &&f)
  14256. {
  14257. EntryType value = d.m(0);
  14258. f(value);
  14259. for (size_t i = 1; i < Size; ++i) {
  14260. if (d.m(i) != value) {
  14261. value = d.m(i);
  14262. f(value);
  14263. }
  14264. }
  14265. }
  14266. template <typename F> Vc_INTRINSIC void call(F &&f) const
  14267. {
  14268. Common::for_all_vector_entries<Size>([&](size_t i) { f(EntryType(d.m(i))); });
  14269. }
  14270. template <typename F> Vc_INTRINSIC void call(F &&f, const Mask &mask) const
  14271. {
  14272. for (size_t i : where(mask)) {
  14273. f(EntryType(d.m(i)));
  14274. }
  14275. }
  14276. template <typename F> Vc_INTRINSIC Vector apply(F &&f) const
  14277. {
  14278. Vector r;
  14279. Common::for_all_vector_entries<Size>(
  14280. [&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); });
  14281. return r;
  14282. }
  14283. template <typename F> Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const
  14284. {
  14285. Vector r(*this);
  14286. for (size_t i : where(mask)) {
  14287. r.d.set(i, f(EntryType(r.d.m(i))));
  14288. }
  14289. return r;
  14290. }
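// Example (sketch) of the apply interface:
//   Vc::float_v y = x.apply([](float e) { return e * e; });                       // square every entry
//   Vc::float_v z = x.apply([](float e) { return -e; }, x > Vc::float_v::Zero()); // change only masked entries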
  14291. template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
  14292. Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f(i)); });
  14293. }
  14294. Vc_INTRINSIC void fill(EntryType (&f)()) {
  14295. Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f()); });
  14296. }
  14297. template <typename G> static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R;
  14298. Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector
  14299. copySign(AsArg x) const
  14300. {
  14301. return Vc::copysign(*this, x);
  14302. }
  14303. Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
  14304. {
return Vc::exponent(*this);
  14306. }
  14307. Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R;
  14308. Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R;
  14309. };
  14310. #undef Vc_CURRENT_CLASS_NAME
  14311. template <typename T> constexpr size_t Vector<T, VectorAbi::Avx>::Size;
  14312. template <typename T> constexpr size_t Vector<T, VectorAbi::Avx>::MemoryAlignment;
  14313. #define Vc_CONDITIONAL_ASSIGN(name_,op_) \
  14314. template <Operator O, typename T, typename M, typename U> \
  14315. Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
  14316. AVX2::Vector<T> &lhs, M &&mask, U &&rhs) \
  14317. { \
  14318. lhs(mask) op_ rhs; \
  14319. } \
  14320. Vc_NOTHING_EXPECTING_SEMICOLON
  14321. Vc_CONDITIONAL_ASSIGN( Assign, =);
  14322. Vc_CONDITIONAL_ASSIGN( PlusAssign, +=);
  14323. Vc_CONDITIONAL_ASSIGN( MinusAssign, -=);
  14324. Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
  14325. Vc_CONDITIONAL_ASSIGN( DivideAssign, /=);
  14326. Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
  14327. Vc_CONDITIONAL_ASSIGN( XorAssign, ^=);
  14328. Vc_CONDITIONAL_ASSIGN( AndAssign, &=);
  14329. Vc_CONDITIONAL_ASSIGN( OrAssign, |=);
  14330. Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
  14331. Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
  14332. #undef Vc_CONDITIONAL_ASSIGN
  14333. #define Vc_CONDITIONAL_ASSIGN(name_,expr_) \
  14334. template <Operator O, typename T, typename M> \
  14335. Vc_INTRINSIC enable_if<O == Operator::name_, AVX2::Vector<T>> conditional_assign( \
  14336. AVX2::Vector<T> &lhs, M &&mask) \
  14337. { \
  14338. return expr_; \
  14339. } \
  14340. Vc_NOTHING_EXPECTING_SEMICOLON
  14341. Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
  14342. Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
  14343. Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
  14344. Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
  14345. #undef Vc_CONDITIONAL_ASSIGN
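// conditional_assign is the hook used by Vc's generic masked-assignment machinery;
// every overload simply forwards to the write-masked form, e.g. (sketch):
//   Vc::float_v v = ...;
//   v(v < Vc::float_v::Zero()) *= -1.f;  // masked multiply: absolute value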
  14346. }
  14347. #ifndef VC_AVX_LIMITS_H_
  14348. #define VC_AVX_LIMITS_H_
  14349. namespace std
  14350. {
  14351. #define Vc_NUM_LIM(T,_max,_min) \
  14352. template <> struct numeric_limits<Vc::AVX2::Vector<T>> : public numeric_limits<T> { \
  14353. static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> max() Vc_NOEXCEPT \
  14354. { \
  14355. return _max; \
  14356. } \
  14357. static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> min() Vc_NOEXCEPT \
  14358. { \
  14359. return _min; \
  14360. } \
  14361. static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> lowest() Vc_NOEXCEPT \
  14362. { \
  14363. return min(); \
  14364. } \
  14365. static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> epsilon() Vc_NOEXCEPT \
  14366. { \
  14367. return Vc::AVX2::Vector<T>::Zero(); \
  14368. } \
  14369. static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> round_error() Vc_NOEXCEPT \
  14370. { \
  14371. return Vc::AVX2::Vector<T>::Zero(); \
  14372. } \
  14373. static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> infinity() Vc_NOEXCEPT \
  14374. { \
  14375. return Vc::AVX2::Vector<T>::Zero(); \
  14376. } \
  14377. static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> quiet_NaN() Vc_NOEXCEPT \
  14378. { \
  14379. return Vc::AVX2::Vector<T>::Zero(); \
  14380. } \
  14381. static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> signaling_NaN() Vc_NOEXCEPT \
  14382. { \
  14383. return Vc::AVX2::Vector<T>::Zero(); \
  14384. } \
  14385. static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> denorm_min() Vc_NOEXCEPT \
  14386. { \
  14387. return Vc::AVX2::Vector<T>::Zero(); \
  14388. } \
  14389. }
  14390. #ifdef Vc_IMPL_AVX2
  14391. Vc_NUM_LIM(unsigned short, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>());
  14392. Vc_NUM_LIM( short, _mm256_srli_epi16(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi16());
  14393. Vc_NUM_LIM( unsigned int, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>());
  14394. Vc_NUM_LIM( int, _mm256_srli_epi32(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi32());
  14395. #endif
  14396. #undef Vc_NUM_LIM
  14397. }
  14398. #endif
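// With these specializations (available when AVX2 is enabled) generic code can query
// vector-wide limits, e.g. (sketch):
//   auto m = std::numeric_limits<Vc::AVX2::int_v>::max();  // every entry holds INT_MAX
// Note that the floating-point-only members (epsilon, infinity, NaN, denorm_min) are
// defined to return Zero() for these integer vector types.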
  14399. #ifndef VC_AVX_CONST_H_
  14400. #define VC_AVX_CONST_H_
  14401. #include <cstddef>
  14402. namespace Vc_VERSIONED_NAMESPACE
  14403. {
  14404. namespace AVX
  14405. {
  14406. template<typename T> struct IndexesFromZeroData;
  14407. template<> struct IndexesFromZeroData<int> {
  14408. static Vc_ALWAYS_INLINE Vc_CONST const int *address() { return reinterpret_cast<const int *>(&_IndexesFromZero32[0]); }
  14409. };
  14410. template<> struct IndexesFromZeroData<unsigned int> {
  14411. static Vc_ALWAYS_INLINE Vc_CONST const unsigned int *address() { return &_IndexesFromZero32[0]; }
  14412. };
  14413. template<> struct IndexesFromZeroData<short> {
  14414. static Vc_ALWAYS_INLINE Vc_CONST const short *address() { return reinterpret_cast<const short *>(&_IndexesFromZero16[0]); }
  14415. };
  14416. template<> struct IndexesFromZeroData<unsigned short> {
  14417. static Vc_ALWAYS_INLINE Vc_CONST const unsigned short *address() { return &_IndexesFromZero16[0]; }
  14418. };
  14419. template<> struct IndexesFromZeroData<signed char> {
  14420. static Vc_ALWAYS_INLINE Vc_CONST const signed char *address() { return reinterpret_cast<const signed char *>(&_IndexesFromZero8[0]); }
  14421. };
  14422. template<> struct IndexesFromZeroData<char> {
  14423. static Vc_ALWAYS_INLINE Vc_CONST const char *address() { return reinterpret_cast<const char *>(&_IndexesFromZero8[0]); }
  14424. };
  14425. template<> struct IndexesFromZeroData<unsigned char> {
  14426. static Vc_ALWAYS_INLINE Vc_CONST const unsigned char *address() { return &_IndexesFromZero8[0]; }
  14427. };
  14428. template<typename _T> struct Const
  14429. {
  14430. typedef Vector<_T> V;
  14431. typedef typename V::EntryType T;
  14432. typedef typename V::Mask M;
  14433. static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return V(c_trig<T>::data[0]); }
  14434. static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return V(c_trig<T>::data[1]); }
  14435. static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return V(c_trig<T>::data[2]); }
  14436. static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return V(c_trig<T>::data[3]); }
  14437. static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return V(c_trig<T>::data[4]); }
  14438. static Vc_ALWAYS_INLINE Vc_CONST V _16() { return V(c_trig<T>::data[5]); }
  14439. static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return V(c_trig<T>::data[(12 + i)]); }
  14440. static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return V(c_trig<T>::data[(17 + i)]); }
  14441. static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return V(c_trig<T>::data[22]); }
  14442. static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return V(c_trig<T>::data[23]); }
  14443. static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return V(c_trig<T>::data[24]); }
  14444. static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return V(c_trig<T>::data[8]); }
  14445. static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return V(c_trig<T>::data[9]); }
  14446. static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return V(c_trig<T>::data[10]); }
  14447. static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return V(c_trig<T>::data[11]); }
  14448. static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return V(c_trig<T>::data[(28 + i)]); }
  14449. static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return V(c_trig<T>::data[(33 + i)]); }
  14450. static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return V(c_trig<T>::data[(37 + i)]); }
  14451. static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return V(c_trig<T>::data[(43 + i)]); }
  14452. static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return V(c_trig<T>::data[25]); }
  14453. static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return V(c_trig<T>::data[26]); }
  14454. static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(V(c_log<T>::d(1)).data()); }
  14455. static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return V(c_log<T>::d(18)); }
  14456. static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return V(c_log<T>::d(15)); }
  14457. static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return V(c_log<T>::d(2 + i)); }
  14458. static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return V(c_log<T>::d(8 + i)); }
  14459. static Vc_ALWAYS_INLINE Vc_CONST V min() { return V(c_log<T>::d(14)); }
  14460. static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return V(c_log<T>::d(17)); }
  14461. static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return V(c_log<T>::d(16)); }
  14462. static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return V(c_log<T>::d(13)); }
  14463. static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return V(c_log<T>::d(19)); }
  14464. static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return V(c_log<T>::d(20)); }
  14465. static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R;
  14466. static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask(int bits) Vc_ALWAYS_INLINE_R Vc_CONST_R;
  14467. };
  14468. template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask()
  14469. {
  14470. return _mm256_broadcast_ss(
  14471. reinterpret_cast<const float *>(&c_general::highMaskFloat));
  14472. }
  14473. template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask()
  14474. {
  14475. return _mm256_broadcast_sd(
  14476. reinterpret_cast<const double *>(&c_general::highMaskDouble));
  14477. }
  14478. template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask(int bits)
  14479. {
  14480. #ifdef Vc_IMPL_AVX2
  14481. #if defined Vc_ICC || defined Vc_MSVC
  14482. __m256i allone;
  14483. allone = _mm256_cmpeq_epi8(allone, allone);
  14484. #else
  14485. auto allone = ~__m256i();
  14486. #endif
  14487. return _mm256_castsi256_ps(_mm256_slli_epi32(allone, bits));
  14488. #else
  14489. __m128 tmp = _mm_castsi128_ps(_mm_slli_epi32(_mm_setallone_si128(), bits));
  14490. return concat(tmp, tmp);
  14491. #endif
  14492. }
  14493. template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask(int bits)
  14494. {
  14495. #ifdef Vc_IMPL_AVX2
  14496. #if defined Vc_ICC || defined Vc_MSVC
  14497. __m256i allone;
  14498. allone = _mm256_cmpeq_epi8(allone, allone);
  14499. #else
  14500. auto allone = ~__m256i();
  14501. #endif
  14502. return _mm256_castsi256_pd(_mm256_slli_epi64(allone, bits));
  14503. #else
  14504. __m128d tmp = _mm_castsi128_pd(_mm_slli_epi64(_mm_setallone_si128(), bits));
  14505. return concat(tmp, tmp);
  14506. #endif
  14507. }
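// highMask() returns a vector whose entries have only the high bits set
// (c_general::highMaskFloat / highMaskDouble); highMask(bits) sets the topmost `bits`
// bits of every entry. Such masks are typically used to truncate mantissa bits, e.g.
// when splitting a value into high and low parts for extended-precision arithmetic.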
  14508. }
  14509. namespace AVX2
  14510. {
  14511. using AVX::IndexesFromZeroData;
  14512. using AVX::Const;
  14513. }
  14514. }
  14515. #endif
  14516. namespace Vc_VERSIONED_NAMESPACE
  14517. {
  14518. namespace Detail
  14519. {
  14520. Vc_INTRINSIC AVX2::double_m operator==(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpeq_pd(a.data(), b.data()); }
  14521. Vc_INTRINSIC AVX2:: float_m operator==(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpeq_ps(a.data(), b.data()); }
  14522. Vc_INTRINSIC AVX2::double_m operator!=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpneq_pd(a.data(), b.data()); }
  14523. Vc_INTRINSIC AVX2:: float_m operator!=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpneq_ps(a.data(), b.data()); }
  14524. Vc_INTRINSIC AVX2::double_m operator>=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpnlt_pd(a.data(), b.data()); }
  14525. Vc_INTRINSIC AVX2:: float_m operator>=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpnlt_ps(a.data(), b.data()); }
  14526. Vc_INTRINSIC AVX2::double_m operator<=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmple_pd(a.data(), b.data()); }
  14527. Vc_INTRINSIC AVX2:: float_m operator<=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmple_ps(a.data(), b.data()); }
  14528. Vc_INTRINSIC AVX2::double_m operator> (AVX2::double_v a, AVX2::double_v b) { return AVX::cmpgt_pd(a.data(), b.data()); }
  14529. Vc_INTRINSIC AVX2:: float_m operator> (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpgt_ps(a.data(), b.data()); }
  14530. Vc_INTRINSIC AVX2::double_m operator< (AVX2::double_v a, AVX2::double_v b) { return AVX::cmplt_pd(a.data(), b.data()); }
  14531. Vc_INTRINSIC AVX2:: float_m operator< (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmplt_ps(a.data(), b.data()); }
  14532. #ifdef Vc_IMPL_AVX2
  14533. Vc_INTRINSIC AVX2:: int_m operator==(AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
  14534. Vc_INTRINSIC AVX2:: uint_m operator==(AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
  14535. Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
  14536. Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
  14537. Vc_INTRINSIC AVX2:: int_m operator!=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
  14538. Vc_INTRINSIC AVX2:: uint_m operator!=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
  14539. Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
  14540. Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
  14541. Vc_INTRINSIC AVX2:: int_m operator>=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); }
  14542. Vc_INTRINSIC AVX2:: uint_m operator>=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); }
  14543. Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); }
  14544. Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); }
  14545. Vc_INTRINSIC AVX2:: int_m operator<=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); }
  14546. Vc_INTRINSIC AVX2:: uint_m operator<=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); }
  14547. Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); }
  14548. Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); }
  14549. Vc_INTRINSIC AVX2:: int_m operator> (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); }
  14550. Vc_INTRINSIC AVX2:: uint_m operator> (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); }
  14551. Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); }
  14552. Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); }
  14553. Vc_INTRINSIC AVX2:: int_m operator< (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); }
  14554. Vc_INTRINSIC AVX2:: uint_m operator< (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); }
  14555. Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); }
  14556. Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmplt_epu16(a.data(), b.data()); }
  14557. #endif
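// AVX2 natively provides only equality and signed greater-than for integers; the
// remaining orderings are synthesized above: !=, >= and <= negate (not_) the
// corresponding primitive comparison, and the unsigned variants go through the
// cmpgt_epu* / cmplt_epu* helpers.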
  14558. template <typename T>
  14559. Vc_INTRINSIC AVX2::Vector<T> operator^(AVX2::Vector<T> a, AVX2::Vector<T> b)
  14560. {
  14561. return xor_(a.data(), b.data());
  14562. }
  14563. template <typename T>
  14564. Vc_INTRINSIC AVX2::Vector<T> operator&(AVX2::Vector<T> a, AVX2::Vector<T> b)
  14565. {
  14566. return and_(a.data(), b.data());
  14567. }
  14568. template <typename T>
  14569. Vc_INTRINSIC AVX2::Vector<T> operator|(AVX2::Vector<T> a, AVX2::Vector<T> b)
  14570. {
  14571. return or_(a.data(), b.data());
  14572. }
  14573. template <typename T>
  14574. Vc_INTRINSIC AVX2::Vector<T> operator+(AVX2::Vector<T> a, AVX2::Vector<T> b)
  14575. {
  14576. return add(a.data(), b.data(), T());
  14577. }
  14578. template <typename T>
  14579. Vc_INTRINSIC AVX2::Vector<T> operator-(AVX2::Vector<T> a, AVX2::Vector<T> b)
  14580. {
  14581. return sub(a.data(), b.data(), T());
  14582. }
  14583. template <typename T>
  14584. Vc_INTRINSIC AVX2::Vector<T> operator*(AVX2::Vector<T> a, AVX2::Vector<T> b)
  14585. {
  14586. return mul(a.data(), b.data(), T());
  14587. }
  14588. template <typename T>
  14589. Vc_INTRINSIC AVX2::Vector<T> operator/(AVX2::Vector<T> a, AVX2::Vector<T> b)
  14590. {
  14591. return div(a.data(), b.data(), T());
  14592. }
  14593. Vc_INTRINSIC AVX2::Vector<ushort> operator/(AVX2::Vector<ushort> a,
  14594. AVX2::Vector<ushort> b)
  14595. {
  14596. using namespace AVX;
  14597. const __m256 lo = _mm256_div_ps(convert<ushort, float>(lo128(a.data())),
  14598. convert<ushort, float>(lo128(b.data())));
  14599. const __m256 hi = _mm256_div_ps(convert<ushort, float>(hi128(a.data())),
  14600. convert<ushort, float>(hi128(b.data())));
  14601. const float_v threshold = 32767.f;
  14602. using Detail::operator>;
  14603. const __m128i loShort = (Vc_IS_UNLIKELY((float_v(lo) > threshold).isNotEmpty()))
  14604. ? convert<float, ushort>(lo)
  14605. : convert<float, short>(lo);
  14606. const __m128i hiShort = (Vc_IS_UNLIKELY((float_v(hi) > threshold).isNotEmpty()))
  14607. ? convert<float, ushort>(hi)
  14608. : convert<float, short>(hi);
  14609. return concat(loShort, hiShort);
  14610. }
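// ushort has no native SIMD division, so both operands are converted to float, divided,
// and converted back. The 32767 threshold selects the more expensive float->ushort
// conversion only when a quotient exceeds the signed 16-bit range; otherwise the
// cheaper float->short conversion suffices.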
  14611. template <typename T>
  14612. Vc_INTRINSIC enable_if<std::is_integral<T>::value, AVX2::Vector<T>> operator%(
  14613. AVX2::Vector<T> a, AVX2::Vector<T> b)
  14614. {
  14615. return a - a / b * b;
  14616. }
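// Integral operator% uses the identity a % b == a - (a / b) * b on top of the
// truncating vector division above.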
  14617. }
  14618. template <> template <typename G> Vc_INTRINSIC AVX2::double_v AVX2::double_v::generate(G gen)
  14619. {
  14620. const auto tmp0 = gen(0);
  14621. const auto tmp1 = gen(1);
  14622. const auto tmp2 = gen(2);
  14623. const auto tmp3 = gen(3);
  14624. return _mm256_setr_pd(tmp0, tmp1, tmp2, tmp3);
  14625. }
  14626. template <> template <typename G> Vc_INTRINSIC AVX2::float_v AVX2::float_v::generate(G gen)
  14627. {
  14628. const auto tmp0 = gen(0);
  14629. const auto tmp1 = gen(1);
  14630. const auto tmp2 = gen(2);
  14631. const auto tmp3 = gen(3);
  14632. const auto tmp4 = gen(4);
  14633. const auto tmp5 = gen(5);
  14634. const auto tmp6 = gen(6);
  14635. const auto tmp7 = gen(7);
  14636. return _mm256_setr_ps(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  14637. }
  14638. #ifdef Vc_IMPL_AVX2
  14639. template <> template <typename G> Vc_INTRINSIC AVX2::int_v AVX2::int_v::generate(G gen)
  14640. {
  14641. const auto tmp0 = gen(0);
  14642. const auto tmp1 = gen(1);
  14643. const auto tmp2 = gen(2);
  14644. const auto tmp3 = gen(3);
  14645. const auto tmp4 = gen(4);
  14646. const auto tmp5 = gen(5);
  14647. const auto tmp6 = gen(6);
  14648. const auto tmp7 = gen(7);
  14649. return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  14650. }
  14651. template <> template <typename G> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::generate(G gen)
  14652. {
  14653. const auto tmp0 = gen(0);
  14654. const auto tmp1 = gen(1);
  14655. const auto tmp2 = gen(2);
  14656. const auto tmp3 = gen(3);
  14657. const auto tmp4 = gen(4);
  14658. const auto tmp5 = gen(5);
  14659. const auto tmp6 = gen(6);
  14660. const auto tmp7 = gen(7);
  14661. return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  14662. }
  14663. template <> template <typename G> Vc_INTRINSIC AVX2::short_v AVX2::short_v::generate(G gen)
  14664. {
  14665. const auto tmp0 = gen(0);
  14666. const auto tmp1 = gen(1);
  14667. const auto tmp2 = gen(2);
  14668. const auto tmp3 = gen(3);
  14669. const auto tmp4 = gen(4);
  14670. const auto tmp5 = gen(5);
  14671. const auto tmp6 = gen(6);
  14672. const auto tmp7 = gen(7);
  14673. const auto tmp8 = gen(8);
  14674. const auto tmp9 = gen(9);
  14675. const auto tmp10 = gen(10);
  14676. const auto tmp11 = gen(11);
  14677. const auto tmp12 = gen(12);
  14678. const auto tmp13 = gen(13);
  14679. const auto tmp14 = gen(14);
  14680. const auto tmp15 = gen(15);
  14681. return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
  14682. }
  14683. template <> template <typename G> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::generate(G gen)
  14684. {
  14685. const auto tmp0 = gen(0);
  14686. const auto tmp1 = gen(1);
  14687. const auto tmp2 = gen(2);
  14688. const auto tmp3 = gen(3);
  14689. const auto tmp4 = gen(4);
  14690. const auto tmp5 = gen(5);
  14691. const auto tmp6 = gen(6);
  14692. const auto tmp7 = gen(7);
  14693. const auto tmp8 = gen(8);
  14694. const auto tmp9 = gen(9);
  14695. const auto tmp10 = gen(10);
  14696. const auto tmp11 = gen(11);
  14697. const auto tmp12 = gen(12);
  14698. const auto tmp13 = gen(13);
  14699. const auto tmp14 = gen(14);
  14700. const auto tmp15 = gen(15);
  14701. return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
  14702. }
  14703. #endif
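// generate(gen) evaluates gen(i) once per entry index i, e.g. (sketch):
//   auto ramp = Vc::float_v::generate([](int i) { return 0.5f * i; });  // 0, 0.5, 1, 1.5, ...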
  14704. template <typename T> Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(VectorSpecialInitializerZero) : d{} {}
  14705. template <> Vc_INTRINSIC Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_pd()) {}
  14706. template <> Vc_INTRINSIC Vector< float, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_ps()) {}
  14707. #ifdef Vc_IMPL_AVX2
  14708. template <> Vc_INTRINSIC Vector< int, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi32()) {}
  14709. template <> Vc_INTRINSIC Vector< uint, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu32()) {}
  14710. template <> Vc_INTRINSIC Vector< short, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi16()) {}
  14711. template <> Vc_INTRINSIC Vector<ushort, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu16()) {}
  14712. template <> Vc_INTRINSIC Vector< schar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi8()) {}
  14713. template <> Vc_INTRINSIC Vector< uchar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu8()) {}
  14714. #endif
  14715. template <typename T>
  14716. Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx>::Vector(
  14717. VectorSpecialInitializerIndexesFromZero)
  14718. : Vector(AVX::IndexesFromZeroData<T>::address(), Vc::Aligned)
  14719. {
  14720. }
  14721. template <>
  14722. Vc_ALWAYS_INLINE Vector<float, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
  14723. : Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
  14724. {
  14725. }
  14726. template <>
  14727. Vc_ALWAYS_INLINE Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
  14728. : Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
  14729. {
  14730. }
  14731. template <typename DstT>
  14732. template <typename SrcT, typename Flags>
  14733. Vc_INTRINSIC typename Vector<DstT, VectorAbi::Avx>::
  14734. #ifndef Vc_MSVC
  14735. template
  14736. #endif
  14737. load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Avx>::load(const SrcT *mem, Flags flags)
  14738. {
  14739. Common::handleLoadPrefetches(mem, flags);
  14740. d.v() = Detail::load<VectorType, DstT>(mem, flags);
  14741. }
  14742. template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero()
  14743. {
  14744. data() = Detail::zero<VectorType>();
  14745. }
  14746. template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero(const Mask &k)
  14747. {
  14748. data() = Detail::andnot_(k.data(), data());
  14749. }
  14750. template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZeroInverted(const Mask &k)
  14751. {
  14752. data() = Detail::and_(k.data(), data());
  14753. }
  14754. template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan()
  14755. {
  14756. data() = Detail::allone<VectorType>();
  14757. }
  14758. template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan(MaskArgument k)
  14759. {
  14760. data() = _mm256_or_pd(data(), k.dataD());
  14761. }
  14762. template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan()
  14763. {
  14764. data() = Detail::allone<VectorType>();
  14765. }
  14766. template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan(MaskArgument k)
  14767. {
  14768. data() = _mm256_or_ps(data(), k.dataF());
  14769. }
  14770. template <typename T>
  14771. template <typename U,
  14772. typename Flags,
  14773. typename>
  14774. Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Flags flags) const
  14775. {
  14776. Common::handleStorePrefetches(mem, flags);
  14777. HV::template store<Flags>(mem, data());
  14778. }
  14779. template <typename T>
  14780. template <typename U,
  14781. typename Flags,
  14782. typename>
  14783. Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Mask mask, Flags flags) const
  14784. {
  14785. Common::handleStorePrefetches(mem, flags);
  14786. HV::template store<Flags>(mem, data(), mask.data());
  14787. }
  14788. #ifdef Vc_IMPL_AVX2
  14789. template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
  14790. template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
  14791. template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srav_epi32(d.v(), x.d.v()); }
  14792. template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srlv_epi32(d.v(), x.d.v()); }
  14793. template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
  14794. template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
  14795. template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
  14796. template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
  14797. template <typename T>
  14798. Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(AsArg x)
  14799. {
  14800. static_assert(std::is_integral<T>::value,
  14801. "bitwise-operators can only be used with Vectors of integral type");
  14802. return *this = *this << x;
  14803. }
  14804. template <typename T>
  14805. Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(AsArg x)
  14806. {
  14807. static_assert(std::is_integral<T>::value,
  14808. "bitwise-operators can only be used with Vectors of integral type");
  14809. return *this = *this >> x;
  14810. }
  14811. #endif
  14812. template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(int shift) {
  14813. d.v() = Detail::shiftRight(d.v(), shift, T());
  14814. return *static_cast<AVX2::Vector<T> *>(this);
  14815. }
  14816. template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator>>(int shift) const {
  14817. return Detail::shiftRight(d.v(), shift, T());
  14818. }
  14819. template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(int shift) {
  14820. d.v() = Detail::shiftLeft(d.v(), shift, T());
  14821. return *static_cast<AVX2::Vector<T> *>(this);
  14822. }
  14823. template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator<<(int shift) const {
  14824. return Detail::shiftLeft(d.v(), shift, T());
  14825. }
  14826. Vc_INTRINSIC Vc_CONST AVX2::float_m isnegative(AVX2::float_v x)
  14827. {
  14828. return AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
  14829. AVX::avx_cast<__m256i>(_mm256_and_ps(AVX::setsignmask_ps(), x.data()))));
  14830. }
  14831. Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x)
  14832. {
  14833. return Mem::permute<X1, X1, X3, X3>(AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
  14834. AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data())))));
  14835. }
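// isnegative tests the IEEE sign bit directly (broadcast across the entry via an
// arithmetic right shift), so unlike a comparison against zero it also reports
// negative zero as negative.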
  14836. #define Vc_GATHER_IMPL(V_) \
  14837. template <> \
  14838. template <class MT, class IT, int Scale> \
  14839. inline void AVX2::V_::gatherImplementation( \
  14840. const Common::GatherArguments<MT, IT, Scale> &args)
  14841. #define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
  14842. Vc_GATHER_IMPL(double_v) { d.v() = _mm256_setr_pd(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
  14843. Vc_GATHER_IMPL(float_v)
  14844. {
  14845. d.v() = _mm256_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6),
  14846. Vc_M(7));
  14847. }
  14848. #ifdef Vc_IMPL_AVX2
  14849. Vc_GATHER_IMPL(int_v)
  14850. {
  14851. d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
  14852. Vc_M(6), Vc_M(7));
  14853. }
  14854. Vc_GATHER_IMPL(uint_v)
  14855. {
  14856. d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
  14857. Vc_M(6), Vc_M(7));
  14858. }
  14859. Vc_GATHER_IMPL(short_v)
  14860. {
  14861. d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
  14862. Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
  14863. Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
  14864. }
  14865. Vc_GATHER_IMPL(ushort_v)
  14866. {
  14867. d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
  14868. Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
  14869. Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
  14870. }
  14871. #endif
  14872. #undef Vc_M
  14873. #undef Vc_GATHER_IMPL
  14874. template <class T>
  14875. template <class MT, class IT, int Scale>
  14876. inline void Vector<T, VectorAbi::Avx>::gatherImplementation(
  14877. const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
  14878. {
  14879. const auto *mem = args.address;
  14880. const auto indexes = Scale * args.indexes;
  14881. using Selector = std::integral_constant < Common::GatherScatterImplementation,
  14882. #ifdef Vc_USE_SET_GATHERS
  14883. Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
  14884. #endif
  14885. #ifdef Vc_USE_BSF_GATHERS
  14886. Common::GatherScatterImplementation::BitScanLoop
  14887. #elif defined Vc_USE_POPCNT_BSF_GATHERS
  14888. Common::GatherScatterImplementation::PopcntSwitch
  14889. #else
  14890. Common::GatherScatterImplementation::SimpleLoop
  14891. #endif
  14892. > ;
  14893. Common::executeGather(Selector(), *this, mem, indexes, mask);
  14894. }
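// The masked gather above is resolved to one of several software strategies at compile
// time via the Vc_USE_SET_GATHERS / Vc_USE_BSF_GATHERS / Vc_USE_POPCNT_BSF_GATHERS
// macros; the default is a simple loop over the active mask entries.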
  14895. template <typename T>
  14896. template <typename MT, typename IT>
  14897. inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes) const
  14898. {
  14899. Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
  14900. }
  14901. template <typename T>
  14902. template <typename MT, typename IT>
  14903. inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
  14904. {
  14905. using Selector = std::integral_constant < Common::GatherScatterImplementation,
  14906. #ifdef Vc_USE_SET_GATHERS
  14907. Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
  14908. #endif
  14909. #ifdef Vc_USE_BSF_GATHERS
  14910. Common::GatherScatterImplementation::BitScanLoop
  14911. #elif defined Vc_USE_POPCNT_BSF_GATHERS
  14912. Common::GatherScatterImplementation::PopcntSwitch
  14913. #else
  14914. Common::GatherScatterImplementation::SimpleLoop
  14915. #endif
  14916. > ;
  14917. Common::executeScatter(Selector(), *this, mem, std::forward<IT>(indexes), mask);
  14918. }
  14919. #ifdef Vc_USE_BUILTIN_VECTOR_TYPES
  14920. template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
  14921. {
  14922. return VectorType(-d.builtin());
  14923. }
  14924. #else
  14925. template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
  14926. {
  14927. return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
  14928. }
  14929. #endif
  14930. template <typename T>
  14931. Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
  14932. Vector<T, VectorAbi::Avx>::minIndex() const
  14933. {
  14934. AVX2::Vector<T> x = min();
  14935. return std::make_pair(x, (*this == x).firstOne());
  14936. }
  14937. template <typename T>
  14938. Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
  14939. Vector<T, VectorAbi::Avx>::maxIndex() const
  14940. {
  14941. AVX2::Vector<T> x = max();
  14942. return std::make_pair(x, (*this == x).firstOne());
  14943. }
  14944. template <> Vc_INTRINSIC std::pair<AVX2::float_v, int> AVX2::float_v::minIndex() const
  14945. {
  14946. __m256 x = d.v();
  14947. __m256 idx = Vector<float>::IndexesFromZero().data();
  14948. __m256 y = Mem::permute128<X1, X0>(x);
  14949. __m256 idy = Mem::permute128<X1, X0>(idx);
  14950. __m256 less = AVX::cmplt_ps(x, y);
  14951. x = _mm256_blendv_ps(y, x, less);
  14952. idx = _mm256_blendv_ps(idy, idx, less);
  14953. y = Reg::permute<X2, X3, X0, X1>(x);
  14954. idy = Reg::permute<X2, X3, X0, X1>(idx);
  14955. less = AVX::cmplt_ps(x, y);
  14956. x = _mm256_blendv_ps(y, x, less);
  14957. idx = _mm256_blendv_ps(idy, idx, less);
  14958. y = Reg::permute<X1, X0, X3, X2>(x);
  14959. idy = Reg::permute<X1, X0, X3, X2>(idx);
  14960. less = AVX::cmplt_ps(x, y);
  14961. idx = _mm256_blendv_ps(idy, idx, less);
  14962. const auto index = _mm_cvtsi128_si32(AVX::avx_cast<__m128i>(idx));
  14963. #ifdef Vc_GNU_ASM
  14964. __asm__ __volatile__("");
  14965. #endif
  14966. x = _mm256_blendv_ps(y, x, less);
  14967. return std::make_pair(x, index);
  14968. }
  14969. template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum() const
  14970. {
  14971. AVX2::Vector<T> tmp = *this;
  14972. if (Size > 1) tmp += tmp.shifted(-1);
  14973. if (Size > 2) tmp += tmp.shifted(-2);
  14974. if (Size > 4) tmp += tmp.shifted(-4);
  14975. if (Size > 8) tmp += tmp.shifted(-8);
  14976. if (Size > 16) tmp += tmp.shifted(-16);
  14977. return tmp;
  14978. }
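// partialSum computes an inclusive prefix sum with log2(Size) shifted additions,
// e.g. (sketch, 4-wide): the partial sums of (1, 2, 3, 4) are (1, 3, 6, 10).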
  14979. template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::min(MaskArgument m) const
  14980. {
  14981. AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::max();
  14982. tmp(m) = *this;
  14983. return tmp.min();
  14984. }
  14985. template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::max(MaskArgument m) const
  14986. {
  14987. AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::min();
  14988. tmp(m) = *this;
  14989. return tmp.max();
  14990. }
  14991. template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::product(MaskArgument m) const
  14992. {
  14993. AVX2::Vector<T> tmp(Vc::One);
  14994. tmp(m) = *this;
  14995. return tmp.product();
  14996. }
  14997. template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::sum(MaskArgument m) const
  14998. {
  14999. AVX2::Vector<T> tmp(Vc::Zero);
  15000. tmp(m) = *this;
  15001. return tmp.sum();
  15002. }
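// The masked reductions fill the inactive entries with the neutral element of the
// respective operation (numeric_limits max for min(), min for max(), One for product(),
// Zero for sum()) and then reduce over the full vector.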
  15003. namespace Detail
  15004. {
  15005. Vc_INTRINSIC Vc_CONST __m256 exponent(__m256 v)
  15006. {
  15007. using namespace AVX;
  15008. __m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(v), 23);
  15009. __m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(v)), 23);
  15010. tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
  15011. tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
  15012. return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
  15013. }
  15014. Vc_INTRINSIC Vc_CONST __m256d exponent(__m256d v)
  15015. {
  15016. using namespace AVX;
  15017. __m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(v), 52);
  15018. __m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(v)), 52);
  15019. tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));
  15020. tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
  15021. return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1))));
  15022. }
  15023. }
  15024. Vc_INTRINSIC Vc_CONST AVX2::float_v exponent(AVX2::float_v x)
  15025. {
  15026. using Detail::operator>=;
  15027. Vc_ASSERT((x >= x.Zero()).isFull());
  15028. return Detail::exponent(x.data());
  15029. }
  15030. Vc_INTRINSIC Vc_CONST AVX2::double_v exponent(AVX2::double_v x)
  15031. {
  15032. using Detail::operator>=;
  15033. Vc_ASSERT((x >= x.Zero()).isFull());
  15034. return Detail::exponent(x.data());
  15035. }
  15036. static Vc_ALWAYS_INLINE __m256i _doRandomStep()
  15037. {
  15038. using Detail::operator*;
  15039. using Detail::operator+;
  15040. #ifdef Vc_IMPL_AVX2
  15041. using AVX2::uint_v;
  15042. uint_v state0(&Common::RandomState[0]);
  15043. uint_v state1(&Common::RandomState[uint_v::Size]);
  15044. (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
  15045. uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
  15046. _mm256_srli_epi32(state1.data(), 16)))
  15047. .store(&Common::RandomState[0]);
  15048. return state0.data();
  15049. #else
  15050. using SSE::uint_v;
  15051. uint_v state0(&Common::RandomState[0]);
  15052. uint_v state1(&Common::RandomState[uint_v::Size]);
  15053. uint_v state2(&Common::RandomState[2 * uint_v::Size]);
  15054. uint_v state3(&Common::RandomState[3 * uint_v::Size]);
  15055. (state2 * uint_v(0xdeece66du) + uint_v(11))
  15056. .store(&Common::RandomState[2 * uint_v::Size]);
  15057. (state3 * uint_v(0xdeece66du) + uint_v(11))
  15058. .store(&Common::RandomState[3 * uint_v::Size]);
  15059. uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
  15060. _mm_srli_epi32(state2.data(), 16)))
  15061. .store(&Common::RandomState[0]);
  15062. uint_v(Detail::xor_((state1 * uint_v(0xdeece66du) + uint_v(11)).data(),
  15063. _mm_srli_epi32(state3.data(), 16)))
  15064. .store(&Common::RandomState[uint_v::Size]);
  15065. return AVX::concat(state0.data(), state1.data());
  15066. #endif
  15067. }
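// _doRandomStep advances the shared Common::RandomState with one linear congruential
// step per 32-bit word (multiplier 0xdeece66d, increment 11, i.e. the truncated drand48
// constants) and mixes two state streams by xoring in the upper half of the second one.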
  15068. #ifdef Vc_IMPL_AVX2
  15069. template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::Random()
  15070. {
  15071. return {_doRandomStep()};
  15072. }
  15073. #endif
  15074. template <> Vc_ALWAYS_INLINE AVX2::float_v AVX2::float_v::Random()
  15075. {
  15076. return HT::sub(Detail::or_(_cast(AVX::srli_epi32<2>(_doRandomStep())), HT::one()),
  15077. HT::one());
  15078. }
  15079. template<> Vc_ALWAYS_INLINE AVX2::double_v AVX2::double_v::Random()
  15080. {
  15081. const __m256i state = Detail::load(&Common::RandomState[0], Vc::Aligned,
  15082. Detail::LoadTag<__m256i, int>());
  15083. for (size_t k = 0; k < 8; k += 2) {
  15084. typedef unsigned long long uint64 Vc_MAY_ALIAS;
  15085. const uint64 stateX = *aliasing_cast<uint64>(&Common::RandomState[k]);
  15086. *aliasing_cast<uint64>(&Common::RandomState[k]) = (stateX * 0x5deece66dull + 11);
  15087. }
  15088. return HT::sub(Detail::or_(_cast(AVX::srli_epi64<12>(state)), HT::one()), HT::one());
  15089. }
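// Both floating-point Random() overloads place random bits into the mantissa while
// forcing the exponent of 1.0 (the or_ with HT::one()), which yields values in [1, 2);
// subtracting 1 then maps them to the half-open interval [0, 1).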
  15090. template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount) const
  15091. {
  15092. return Detail::shifted<EntryType>(d.v(), amount);
  15093. }
  15094. template <typename VectorType>
  15095. Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m128>)
  15096. {
  15097. return Mem::shuffle<X2, X3, Y0, Y1>(left, right);
  15098. }
  15099. template <typename VectorType>
  15100. Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m256>)
  15101. {
  15102. return Mem::shuffle128<X1, Y0>(left, right);
  15103. }
  15104. template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount, Vector shiftIn) const
  15105. {
  15106. #ifdef __GNUC__
  15107. if (__builtin_constant_p(amount)) {
  15108. const __m256i a = AVX::avx_cast<__m256i>(d.v());
  15109. const __m256i b = AVX::avx_cast<__m256i>(shiftIn.d.v());
  15110. if (amount * 2 == int(Size)) {
  15111. return shifted_shortcut(d.v(), shiftIn.d.v(), WidthT());
  15112. }
  15113. if (amount * 2 == -int(Size)) {
  15114. return shifted_shortcut(shiftIn.d.v(), d.v(), WidthT());
  15115. }
  15116. switch (amount) {
  15117. case 1:
  15118. return AVX::avx_cast<VectorType>(
  15119. #ifdef Vc_IMPL_AVX2
  15120. _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
  15121. sizeof(EntryType))
  15122. #else
  15123. AVX::concat(
  15124. _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), sizeof(EntryType)),
  15125. _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), sizeof(EntryType)))
  15126. #endif
  15127. );
  15128. case 2:
  15129. return AVX::avx_cast<VectorType>(
  15130. #ifdef Vc_IMPL_AVX2
  15131. _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
  15132. 2 * sizeof(EntryType))
  15133. #else
  15134. AVX::concat(
  15135. _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 2 * sizeof(EntryType)),
  15136. _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 2 * sizeof(EntryType)))
  15137. #endif
  15138. );
  15139. case 3:
  15140. if (6u < Size) {
  15141. return AVX::avx_cast<VectorType>(
  15142. #ifdef Vc_IMPL_AVX2
  15143. _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
  15144. 3 * sizeof(EntryType))
  15145. #else
  15146. AVX::concat(_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a),
  15147. 3 * sizeof(EntryType)),
  15148. _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a),
  15149. 3 * sizeof(EntryType)))
  15150. #endif
  15151. );
  15152. }
  15153. }
  15154. }
  15155. #endif
  15156. using Detail::operator|;
  15157. return shifted(amount) | (amount > 0 ?
  15158. shiftIn.shifted(amount - Size) :
  15159. shiftIn.shifted(Size + amount));
  15160. }
  15161. template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::rotated(int amount) const
  15162. {
  15163. return Detail::rotated<EntryType, size()>(d.v(), amount);
  15164. }
  15165. template <typename T>
  15166. Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::sorted()
  15167. const
  15168. {
  15169. return Detail::sorted(*this);
  15170. }
  15171. template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveLow(AVX2::double_v x) const
  15172. {
  15173. return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_pd(data(), x.data()),
  15174. _mm256_unpackhi_pd(data(), x.data()));
  15175. }
  15176. template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveHigh(AVX2::double_v x) const
  15177. {
  15178. return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_pd(data(), x.data()),
  15179. _mm256_unpackhi_pd(data(), x.data()));
  15180. }
  15181. template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveLow(AVX2::float_v x) const
  15182. {
  15183. return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_ps(data(), x.data()),
  15184. _mm256_unpackhi_ps(data(), x.data()));
  15185. }
  15186. template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveHigh(AVX2::float_v x) const
  15187. {
  15188. return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_ps(data(), x.data()),
  15189. _mm256_unpackhi_ps(data(), x.data()));
  15190. }
  15191. #ifdef Vc_IMPL_AVX2
  15192. template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveLow ( AVX2::int_v x) const {
  15193. return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
  15194. _mm256_unpackhi_epi32(data(), x.data()));
  15195. }
  15196. template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveHigh( AVX2::int_v x) const {
  15197. return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
  15198. _mm256_unpackhi_epi32(data(), x.data()));
  15199. }
  15200. template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveLow ( AVX2::uint_v x) const {
  15201. return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
  15202. _mm256_unpackhi_epi32(data(), x.data()));
  15203. }
  15204. template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveHigh( AVX2::uint_v x) const {
  15205. return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
  15206. _mm256_unpackhi_epi32(data(), x.data()));
  15207. }
  15208. template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveLow ( AVX2::short_v x) const {
  15209. return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
  15210. _mm256_unpackhi_epi16(data(), x.data()));
  15211. }
  15212. template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveHigh( AVX2::short_v x) const {
  15213. return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
  15214. _mm256_unpackhi_epi16(data(), x.data()));
  15215. }
  15216. template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveLow (AVX2::ushort_v x) const {
  15217. return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
  15218. _mm256_unpackhi_epi16(data(), x.data()));
  15219. }
  15220. template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveHigh(AVX2::ushort_v x) const {
  15221. return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
  15222. _mm256_unpackhi_epi16(data(), x.data()));
  15223. }
  15224. #endif
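// interleaveLow/interleaveHigh zip two vectors entry by entry, e.g. (sketch, 4-wide
// double_v): for a = (a0, a1, a2, a3) and b = (b0, b1, b2, b3),
//   a.interleaveLow(b)  == (a0, b0, a1, b1)
//   a.interleaveHigh(b) == (a2, b2, a3, b3)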
template <> Vc_INTRINSIC Vc_PURE AVX2::double_v AVX2::double_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X1, X0, X3, X2>(d.v()));
}
template <> Vc_INTRINSIC Vc_PURE AVX2::float_v AVX2::float_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
#ifdef Vc_IMPL_AVX2
template <>
Vc_INTRINSIC Vc_PURE AVX2::int_v AVX2::int_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::uint_v AVX2::uint_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::short_v AVX2::short_v::operator[](
    Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
        AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
        AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::ushort_v AVX2::ushort_v::operator[](
    Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
        AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
        AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
#endif
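// For the 32- and 64-bit element types, reversal is an in-lane permute followed by a
// permute128 lane swap. For the 16-bit types, permuteLo/permuteHi first reverse each
// group of four elements, the 64-bit shuffle swaps those groups within each lane, and
// the final permute128 swaps the two 128-bit lanes.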
template <> Vc_INTRINSIC AVX2::float_v Vector<float, VectorAbi::Avx>::operator[](const IndexType &) const
{
    return *this;
#ifdef Vc_IMPL_AVX2
#else
#endif
}
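// Note: the overload above is effectively a stub: the generic permutation by an
// IndexType is not implemented for AVX float_v here and the input is returned
// unchanged; the empty preprocessor block appears to be left over from a stripped-out
// implementation.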
template <typename T>
Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::reversed() const
{
    return (*this)[Permutation::Reversed];
}
template <> template <int Index> Vc_INTRINSIC AVX2::float_v AVX2::float_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
    constexpr VecPos Outer = static_cast<VecPos>((Index & 0x4) / 4);
    return Mem::permute<Inner, Inner, Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
template <> template <int Index> Vc_INTRINSIC AVX2::double_v AVX2::double_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
    constexpr VecPos Outer = static_cast<VecPos>((Index & 0x2) / 2);
    return Mem::permute<Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
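// broadcast<Index>() replicates one element into all positions. The index is split into
// a lane-local part (Inner) and a lane-selection part (Outer): for float_v, Index & 0x3
// selects the element inside a 128-bit lane and Index / 4 selects the lane; permute128
// first duplicates the selected lane, then the in-lane permute duplicates the selected
// element. Illustrative use (hypothetical values, assuming broadcast() is accessible
// from the calling context):
//   AVX2::float_v v = ...;
//   AVX2::float_v b = v.broadcast<5>();  // every element becomes v[5]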
}
#ifndef VC_AVX_SIMD_CAST_H_
#define VC_AVX_SIMD_CAST_H_
#ifndef VC_AVX_VECTOR_H_
#error "Vc/avx/vector.h needs to be included before Vc/avx/simd_cast.h"
#endif
namespace Vc_VERSIONED_NAMESPACE
{
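// The Vc_SIMD_CAST_* helper macros below generate the signatures of simd_cast()
// overloads: Vc_SIMD_CAST_AVX_N(from_, to_) takes N vectors of type AVX2::from_ and is
// enabled (via the enable_if/nullarg default-argument SFINAE idiom) only when the
// requested destination type To is AVX2::to_; Vc_SIMD_CAST_N does the same for fully
// qualified source and destination types, and Vc_SIMD_CAST_OFFSET additionally matches
// a compile-time offset into the source vector. The macros are first used to declare
// all overloads and are redefined further down (without the default argument) to spell
// out the matching definitions.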
#define Vc_SIMD_CAST_AVX_1(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        AVX2::from_ x, enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)
#define Vc_SIMD_CAST_AVX_2(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        AVX2::from_ x0, AVX2::from_ x1, \
        enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)
#define Vc_SIMD_CAST_AVX_3(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \
        enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)
#define Vc_SIMD_CAST_AVX_4(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, AVX2::from_ x3, \
        enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)
#define Vc_SIMD_CAST_1(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        from_ x, enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_2(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        from_ x0, from_ x1, enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_3(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        from_ x0, from_ x1, from_ x2, enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_4(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        from_ x0, from_ x1, from_ x2, from_ x3, \
        enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_5(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
        enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_6(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, \
        enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_7(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, \
        enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_8(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, from_ x7, \
        enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_OFFSET(from_,to_,offset_) \
    static_assert(from_::size() >= to_::size() * (offset_ + 1), \
                  "this offset cannot exist for this type combination"); \
    template <typename To, int offset> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        from_ x, \
        enable_if<(offset == offset_ && std::is_same<To, to_>::value)> = nullarg)
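// Everything from here up to the #undef block consists of declarations only: each
// supported conversion between the AVX2, SSE and Scalar vector and mask types gets a
// simd_cast overload declared (the integer variants only when Vc_IMPL_AVX2 is active),
// so that the definitions further down can call one another regardless of the order in
// which they appear.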
  15356. template <typename To, typename From>
  15357. Vc_INTRINSIC Vc_CONST To
  15358. simd_cast(From x, enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
  15359. SSE::Vector<typename To::EntryType>::Size == To::Size)> =
  15360. nullarg);
  15361. template <typename To, typename From>
  15362. Vc_INTRINSIC Vc_CONST To simd_cast(
  15363. From x0, From x1,
  15364. enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
  15365. SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
  15366. template <typename To, typename From>
  15367. Vc_INTRINSIC Vc_CONST To simd_cast(
  15368. From x0, From x1, From x2,
  15369. enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
  15370. SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
  15371. template <typename To, typename From>
  15372. Vc_INTRINSIC Vc_CONST To simd_cast(
  15373. From x0, From x1, From x2, From x3,
  15374. enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
  15375. SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
  15376. template <typename To, typename From>
  15377. Vc_INTRINSIC Vc_CONST To simd_cast(
  15378. From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7,
  15379. enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
  15380. SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
  15381. Vc_SIMD_CAST_AVX_1( float_v, double_v);
  15382. Vc_SIMD_CAST_AVX_1(double_v, float_v);
  15383. Vc_SIMD_CAST_AVX_2(double_v, float_v);
  15384. #ifdef Vc_IMPL_AVX2
  15385. Vc_SIMD_CAST_AVX_1( int_v, double_v);
  15386. Vc_SIMD_CAST_AVX_1( uint_v, double_v);
  15387. Vc_SIMD_CAST_AVX_1( short_v, double_v);
  15388. Vc_SIMD_CAST_AVX_1(ushort_v, double_v);
  15389. Vc_SIMD_CAST_AVX_1( int_v, float_v);
  15390. Vc_SIMD_CAST_AVX_1( uint_v, float_v);
  15391. Vc_SIMD_CAST_AVX_1( short_v, float_v);
  15392. Vc_SIMD_CAST_AVX_1(ushort_v, float_v);
  15393. Vc_SIMD_CAST_AVX_1(double_v, int_v);
  15394. Vc_SIMD_CAST_AVX_1( float_v, int_v);
  15395. Vc_SIMD_CAST_AVX_1( uint_v, int_v);
  15396. Vc_SIMD_CAST_AVX_1( short_v, int_v);
  15397. Vc_SIMD_CAST_AVX_1(ushort_v, int_v);
  15398. Vc_SIMD_CAST_AVX_2(double_v, int_v);
  15399. Vc_SIMD_CAST_AVX_1(double_v, uint_v);
  15400. Vc_SIMD_CAST_AVX_1( float_v, uint_v);
  15401. Vc_SIMD_CAST_AVX_1( int_v, uint_v);
  15402. Vc_SIMD_CAST_AVX_1( short_v, uint_v);
  15403. Vc_SIMD_CAST_AVX_1(ushort_v, uint_v);
  15404. Vc_SIMD_CAST_AVX_2(double_v, uint_v);
  15405. Vc_SIMD_CAST_AVX_1(double_v, short_v);
  15406. Vc_SIMD_CAST_AVX_1( float_v, short_v);
  15407. Vc_SIMD_CAST_AVX_1( int_v, short_v);
  15408. Vc_SIMD_CAST_AVX_1( uint_v, short_v);
  15409. Vc_SIMD_CAST_AVX_1(ushort_v, short_v);
  15410. Vc_SIMD_CAST_AVX_2(double_v, short_v);
  15411. Vc_SIMD_CAST_AVX_2( float_v, short_v);
  15412. Vc_SIMD_CAST_AVX_2( int_v, short_v);
  15413. Vc_SIMD_CAST_AVX_2( uint_v, short_v);
  15414. Vc_SIMD_CAST_AVX_3(double_v, short_v);
  15415. Vc_SIMD_CAST_AVX_4(double_v, short_v);
  15416. Vc_SIMD_CAST_AVX_1(double_v, ushort_v);
  15417. Vc_SIMD_CAST_AVX_1( float_v, ushort_v);
  15418. Vc_SIMD_CAST_AVX_1( int_v, ushort_v);
  15419. Vc_SIMD_CAST_AVX_1( uint_v, ushort_v);
  15420. Vc_SIMD_CAST_AVX_1( short_v, ushort_v);
  15421. Vc_SIMD_CAST_AVX_2(double_v, ushort_v);
  15422. Vc_SIMD_CAST_AVX_2( float_v, ushort_v);
  15423. Vc_SIMD_CAST_AVX_2( int_v, ushort_v);
  15424. Vc_SIMD_CAST_AVX_2( uint_v, ushort_v);
  15425. Vc_SIMD_CAST_AVX_3(double_v, ushort_v);
  15426. Vc_SIMD_CAST_AVX_4(double_v, ushort_v);
  15427. #endif
  15428. Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v);
  15429. Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v);
  15430. Vc_SIMD_CAST_1(SSE:: int_v, AVX2::double_v);
  15431. Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::double_v);
  15432. Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v);
  15433. Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v);
  15434. Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v);
  15435. Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v);
  15436. Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: float_v);
  15437. Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: float_v);
  15438. Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v);
  15439. Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: float_v);
  15440. #ifdef Vc_IMPL_AVX2
  15441. Vc_SIMD_CAST_1(SSE::double_v, AVX2:: int_v);
  15442. Vc_SIMD_CAST_1(SSE::double_v, AVX2:: uint_v);
  15443. Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v);
  15444. Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v);
  15445. Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: int_v);
  15446. Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: uint_v);
  15447. Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v);
  15448. Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v);
  15449. Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: int_v);
  15450. Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: int_v);
  15451. Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: int_v);
  15452. Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: int_v);
  15453. Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: uint_v);
  15454. Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: uint_v);
  15455. Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: uint_v);
  15456. Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: uint_v);
  15457. Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: short_v);
  15458. Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: short_v);
  15459. Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v);
  15460. Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v);
  15461. Vc_SIMD_CAST_1(SSE:: int_v, AVX2::ushort_v);
  15462. Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::ushort_v);
  15463. Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v);
  15464. Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v);
  15465. #endif
  15466. Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v);
  15467. Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v);
  15468. Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v);
  15469. Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: float_v);
  15470. Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: float_v);
  15471. #ifdef Vc_IMPL_AVX2
  15472. Vc_SIMD_CAST_2(SSE::double_v, AVX2:: int_v);
  15473. Vc_SIMD_CAST_2(SSE::double_v, AVX2:: uint_v);
  15474. Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v);
  15475. Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v);
  15476. Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: int_v);
  15477. Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: uint_v);
  15478. Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v);
  15479. Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v);
  15480. Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: int_v);
  15481. Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: int_v);
  15482. Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: uint_v);
  15483. Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: uint_v);
  15484. Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: short_v);
  15485. Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: short_v);
  15486. Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v);
  15487. Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v);
  15488. Vc_SIMD_CAST_2(SSE:: int_v, AVX2::ushort_v);
  15489. Vc_SIMD_CAST_2(SSE:: uint_v, AVX2::ushort_v);
  15490. Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v);
  15491. Vc_SIMD_CAST_2(SSE::ushort_v, AVX2::ushort_v);
  15492. #endif
  15493. Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v);
  15494. #ifdef Vc_IMPL_AVX2
  15495. Vc_SIMD_CAST_3(SSE::double_v, AVX2:: int_v);
  15496. Vc_SIMD_CAST_3(SSE::double_v, AVX2:: uint_v);
  15497. Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v);
  15498. Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v);
  15499. Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v);
  15500. Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v);
  15501. Vc_SIMD_CAST_3(SSE:: int_v, AVX2:: short_v);
  15502. Vc_SIMD_CAST_3(SSE:: uint_v, AVX2:: short_v);
  15503. Vc_SIMD_CAST_3(SSE:: int_v, AVX2::ushort_v);
  15504. Vc_SIMD_CAST_3(SSE:: uint_v, AVX2::ushort_v);
  15505. #endif
  15506. Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v);
  15507. #ifdef Vc_IMPL_AVX2
  15508. Vc_SIMD_CAST_4(SSE::double_v, AVX2:: int_v);
  15509. Vc_SIMD_CAST_4(SSE::double_v, AVX2:: uint_v);
  15510. Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v);
  15511. Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v);
  15512. Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v);
  15513. Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v);
  15514. Vc_SIMD_CAST_4(SSE:: int_v, AVX2:: short_v);
  15515. Vc_SIMD_CAST_4(SSE:: uint_v, AVX2:: short_v);
  15516. Vc_SIMD_CAST_4(SSE:: int_v, AVX2::ushort_v);
  15517. Vc_SIMD_CAST_4(SSE:: uint_v, AVX2::ushort_v);
  15518. #endif
  15519. #ifdef Vc_IMPL_AVX2
  15520. Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v);
  15521. Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v);
  15522. #endif
  15523. #ifdef Vc_IMPL_AVX2
  15524. Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v);
  15525. Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v);
  15526. #endif
  15527. #ifdef Vc_IMPL_AVX2
  15528. Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v);
  15529. Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v);
  15530. #endif
  15531. #ifdef Vc_IMPL_AVX2
  15532. Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v);
  15533. Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v);
  15534. #endif
  15535. Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v);
  15536. Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v);
  15537. Vc_SIMD_CAST_1(AVX2::double_v, SSE:: int_v);
  15538. Vc_SIMD_CAST_1(AVX2::double_v, SSE:: uint_v);
  15539. Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v);
  15540. Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v);
  15541. Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v);
  15542. Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v);
  15543. Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: int_v);
  15544. Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: uint_v);
  15545. Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v);
  15546. Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v);
  15547. #ifdef Vc_IMPL_AVX2
  15548. Vc_SIMD_CAST_1(AVX2:: int_v, SSE::double_v);
  15549. Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: float_v);
  15550. Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: uint_v);
  15551. Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: int_v);
  15552. Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: short_v);
  15553. Vc_SIMD_CAST_1(AVX2:: int_v, SSE::ushort_v);
  15554. Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::double_v);
  15555. Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: float_v);
  15556. Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: int_v);
  15557. Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: uint_v);
  15558. Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: short_v);
  15559. Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::ushort_v);
  15560. Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v);
  15561. Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v);
  15562. Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: int_v);
  15563. Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: uint_v);
  15564. Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v);
  15565. Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v);
  15566. Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v);
  15567. Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v);
  15568. Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: int_v);
  15569. Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: uint_v);
  15570. Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v);
  15571. Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v);
  15572. #endif
  15573. Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v);
  15574. Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v);
  15575. template <typename Return, typename T>
  15576. Vc_INTRINSIC Vc_CONST Return
  15577. simd_cast(Scalar::Vector<T> x,
  15578. enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
  15579. template <typename Return, typename T>
  15580. Vc_INTRINSIC Vc_CONST Return
  15581. simd_cast(Scalar::Vector<T> x,
  15582. enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
  15583. #ifdef Vc_IMPL_AVX2
  15584. template <typename Return, typename T>
  15585. Vc_INTRINSIC Vc_CONST Return
  15586. simd_cast(Scalar::Vector<T> x,
  15587. enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
  15588. template <typename Return, typename T>
  15589. Vc_INTRINSIC Vc_CONST Return
  15590. simd_cast(Scalar::Vector<T> x,
  15591. enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
  15592. template <typename Return, typename T>
  15593. Vc_INTRINSIC Vc_CONST Return
  15594. simd_cast(Scalar::Vector<T> x,
  15595. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15596. template <typename Return, typename T>
  15597. Vc_INTRINSIC Vc_CONST Return
  15598. simd_cast(Scalar::Vector<T> x,
  15599. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15600. #endif
  15601. template <typename Return, typename T>
  15602. Vc_INTRINSIC Vc_CONST Return
  15603. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  15604. enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
  15605. template <typename Return, typename T>
  15606. Vc_INTRINSIC Vc_CONST Return
  15607. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  15608. enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
  15609. #ifdef Vc_IMPL_AVX2
  15610. template <typename Return, typename T>
  15611. Vc_INTRINSIC Vc_CONST Return
  15612. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  15613. enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
  15614. template <typename Return, typename T>
  15615. Vc_INTRINSIC Vc_CONST Return
  15616. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  15617. enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
  15618. template <typename Return, typename T>
  15619. Vc_INTRINSIC Vc_CONST Return
  15620. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  15621. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15622. template <typename Return, typename T>
  15623. Vc_INTRINSIC Vc_CONST Return
  15624. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  15625. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15626. #endif
  15627. template <typename Return, typename T>
  15628. Vc_INTRINSIC Vc_CONST Return
  15629. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15630. enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
  15631. template <typename Return, typename T>
  15632. Vc_INTRINSIC Vc_CONST Return
  15633. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15634. enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
  15635. #ifdef Vc_IMPL_AVX2
  15636. template <typename Return, typename T>
  15637. Vc_INTRINSIC Vc_CONST Return
  15638. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15639. enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
  15640. template <typename Return, typename T>
  15641. Vc_INTRINSIC Vc_CONST Return
  15642. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15643. enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
  15644. template <typename Return, typename T>
  15645. Vc_INTRINSIC Vc_CONST Return
  15646. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15647. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15648. template <typename Return, typename T>
  15649. Vc_INTRINSIC Vc_CONST Return
  15650. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15651. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15652. #endif
  15653. template <typename Return, typename T>
  15654. Vc_INTRINSIC Vc_CONST Return
  15655. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15656. Scalar::Vector<T> x3,
  15657. enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
  15658. template <typename Return, typename T>
  15659. Vc_INTRINSIC Vc_CONST Return
  15660. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15661. Scalar::Vector<T> x3,
  15662. enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
  15663. #ifdef Vc_IMPL_AVX2
  15664. template <typename Return, typename T>
  15665. Vc_INTRINSIC Vc_CONST Return
  15666. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15667. Scalar::Vector<T> x3,
  15668. enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
  15669. template <typename Return, typename T>
  15670. Vc_INTRINSIC Vc_CONST Return
  15671. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15672. Scalar::Vector<T> x3,
  15673. enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
  15674. template <typename Return, typename T>
  15675. Vc_INTRINSIC Vc_CONST Return
  15676. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15677. Scalar::Vector<T> x3,
  15678. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15679. template <typename Return, typename T>
  15680. Vc_INTRINSIC Vc_CONST Return
  15681. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15682. Scalar::Vector<T> x3,
  15683. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15684. #endif
  15685. template <typename Return, typename T>
  15686. Vc_INTRINSIC Vc_CONST Return
  15687. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15688. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  15689. enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
  15690. #ifdef Vc_IMPL_AVX2
  15691. template <typename Return, typename T>
  15692. Vc_INTRINSIC Vc_CONST Return
  15693. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15694. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  15695. enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
  15696. template <typename Return, typename T>
  15697. Vc_INTRINSIC Vc_CONST Return
  15698. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15699. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  15700. enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
  15701. template <typename Return, typename T>
  15702. Vc_INTRINSIC Vc_CONST Return
  15703. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15704. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  15705. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15706. template <typename Return, typename T>
  15707. Vc_INTRINSIC Vc_CONST Return
  15708. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15709. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  15710. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15711. #endif
  15712. template <typename Return, typename T>
  15713. Vc_INTRINSIC Vc_CONST Return
  15714. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15715. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15716. enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
  15717. #ifdef Vc_IMPL_AVX2
  15718. template <typename Return, typename T>
  15719. Vc_INTRINSIC Vc_CONST Return
  15720. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15721. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15722. enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
  15723. template <typename Return, typename T>
  15724. Vc_INTRINSIC Vc_CONST Return
  15725. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15726. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15727. enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
  15728. template <typename Return, typename T>
  15729. Vc_INTRINSIC Vc_CONST Return
  15730. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15731. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15732. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15733. template <typename Return, typename T>
  15734. Vc_INTRINSIC Vc_CONST Return
  15735. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15736. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15737. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15738. #endif
  15739. template <typename Return, typename T>
  15740. Vc_INTRINSIC Vc_CONST Return
  15741. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15742. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15743. Scalar::Vector<T> x6,
  15744. enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
  15745. #ifdef Vc_IMPL_AVX2
  15746. template <typename Return, typename T>
  15747. Vc_INTRINSIC Vc_CONST Return
  15748. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15749. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15750. Scalar::Vector<T> x6,
  15751. enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
  15752. template <typename Return, typename T>
  15753. Vc_INTRINSIC Vc_CONST Return
  15754. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15755. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15756. Scalar::Vector<T> x6,
  15757. enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
  15758. template <typename Return, typename T>
  15759. Vc_INTRINSIC Vc_CONST Return
  15760. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15761. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15762. Scalar::Vector<T> x6,
  15763. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15764. template <typename Return, typename T>
  15765. Vc_INTRINSIC Vc_CONST Return
  15766. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15767. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15768. Scalar::Vector<T> x6,
  15769. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15770. #endif
  15771. template <typename Return, typename T>
  15772. Vc_INTRINSIC Vc_CONST Return
  15773. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15774. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15775. Scalar::Vector<T> x6, Scalar::Vector<T> x7,
  15776. enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
  15777. #ifdef Vc_IMPL_AVX2
  15778. template <typename Return, typename T>
  15779. Vc_INTRINSIC Vc_CONST Return
  15780. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15781. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15782. Scalar::Vector<T> x6, Scalar::Vector<T> x7,
  15783. enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
  15784. template <typename Return, typename T>
  15785. Vc_INTRINSIC Vc_CONST Return
  15786. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15787. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15788. Scalar::Vector<T> x6, Scalar::Vector<T> x7,
  15789. enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
  15790. template <typename Return, typename T>
  15791. Vc_INTRINSIC Vc_CONST Return
  15792. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15793. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15794. Scalar::Vector<T> x6, Scalar::Vector<T> x7,
  15795. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15796. template <typename Return, typename T>
  15797. Vc_INTRINSIC Vc_CONST Return
  15798. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15799. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15800. Scalar::Vector<T> x6, Scalar::Vector<T> x7,
  15801. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15802. #endif
  15803. #ifdef Vc_IMPL_AVX2
  15804. template <typename Return, typename T>
  15805. Vc_INTRINSIC Vc_CONST Return
  15806. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15807. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15808. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15809. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15810. template <typename Return, typename T>
  15811. Vc_INTRINSIC Vc_CONST Return
  15812. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15813. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15814. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15815. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15816. #endif
  15817. #ifdef Vc_IMPL_AVX2
  15818. template <typename Return, typename T>
  15819. Vc_INTRINSIC Vc_CONST Return
  15820. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15821. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15822. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15823. Scalar::Vector<T> x9,
  15824. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15825. template <typename Return, typename T>
  15826. Vc_INTRINSIC Vc_CONST Return
  15827. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15828. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15829. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15830. Scalar::Vector<T> x9,
  15831. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15832. #endif
  15833. #ifdef Vc_IMPL_AVX2
  15834. template <typename Return, typename T>
  15835. Vc_INTRINSIC Vc_CONST Return
  15836. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15837. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15838. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15839. Scalar::Vector<T> x9, Scalar::Vector<T> x10,
  15840. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15841. template <typename Return, typename T>
  15842. Vc_INTRINSIC Vc_CONST Return
  15843. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15844. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15845. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15846. Scalar::Vector<T> x9, Scalar::Vector<T> x10,
  15847. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15848. #endif
  15849. #ifdef Vc_IMPL_AVX2
  15850. template <typename Return, typename T>
  15851. Vc_INTRINSIC Vc_CONST Return
  15852. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15853. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15854. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15855. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  15856. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15857. template <typename Return, typename T>
  15858. Vc_INTRINSIC Vc_CONST Return
  15859. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15860. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15861. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15862. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  15863. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15864. #endif
  15865. #ifdef Vc_IMPL_AVX2
  15866. template <typename Return, typename T>
  15867. Vc_INTRINSIC Vc_CONST Return
  15868. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15869. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15870. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15871. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  15872. Scalar::Vector<T> x12,
  15873. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15874. template <typename Return, typename T>
  15875. Vc_INTRINSIC Vc_CONST Return
  15876. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15877. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15878. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15879. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  15880. Scalar::Vector<T> x12,
  15881. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15882. #endif
  15883. #ifdef Vc_IMPL_AVX2
  15884. template <typename Return, typename T>
  15885. Vc_INTRINSIC Vc_CONST Return
  15886. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15887. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15888. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15889. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  15890. Scalar::Vector<T> x12, Scalar::Vector<T> x13,
  15891. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15892. template <typename Return, typename T>
  15893. Vc_INTRINSIC Vc_CONST Return
  15894. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15895. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15896. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15897. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  15898. Scalar::Vector<T> x12, Scalar::Vector<T> x13,
  15899. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15900. #endif
  15901. #ifdef Vc_IMPL_AVX2
  15902. template <typename Return, typename T>
  15903. Vc_INTRINSIC Vc_CONST Return
  15904. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15905. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15906. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15907. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  15908. Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
  15909. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15910. template <typename Return, typename T>
  15911. Vc_INTRINSIC Vc_CONST Return
  15912. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15913. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15914. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15915. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  15916. Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
  15917. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15918. #endif
  15919. #ifdef Vc_IMPL_AVX2
  15920. template <typename Return, typename T>
  15921. Vc_INTRINSIC Vc_CONST Return
  15922. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15923. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15924. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15925. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  15926. Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
  15927. Scalar::Vector<T> x15,
  15928. enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
  15929. template <typename Return, typename T>
  15930. Vc_INTRINSIC Vc_CONST Return
  15931. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  15932. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  15933. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  15934. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  15935. Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
  15936. Scalar::Vector<T> x15,
  15937. enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
  15938. #endif
  15939. template <typename To, typename FromT>
  15940. Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Vector<FromT> x,
  15941. enable_if<Scalar::is_vector<To>::value> = nullarg);
  15942. template <typename Return, typename T>
  15943. Vc_INTRINSIC Vc_CONST Return
  15944. simd_cast(const AVX2::Mask<T> &k, enable_if<AVX2::is_mask<Return>::value> = nullarg);
  15945. Vc_SIMD_CAST_AVX_2(double_m, float_m);
  15946. #ifdef Vc_IMPL_AVX2
  15947. Vc_SIMD_CAST_AVX_2(double_m, int_m);
  15948. Vc_SIMD_CAST_AVX_2(double_m, uint_m);
  15949. Vc_SIMD_CAST_AVX_2(double_m, short_m);
  15950. Vc_SIMD_CAST_AVX_2(double_m, ushort_m);
  15951. Vc_SIMD_CAST_AVX_2( float_m, short_m);
  15952. Vc_SIMD_CAST_AVX_2( float_m, ushort_m);
  15953. Vc_SIMD_CAST_AVX_2( int_m, short_m);
  15954. Vc_SIMD_CAST_AVX_2( int_m, ushort_m);
  15955. Vc_SIMD_CAST_AVX_2( uint_m, short_m);
  15956. Vc_SIMD_CAST_AVX_2( uint_m, ushort_m);
  15957. #endif
  15958. #ifdef Vc_IMPL_AVX2
  15959. Vc_SIMD_CAST_AVX_4(double_m, short_m);
  15960. Vc_SIMD_CAST_AVX_4(double_m, ushort_m);
  15961. #endif
  15962. Vc_SIMD_CAST_1(SSE::double_m, AVX2::double_m);
  15963. Vc_SIMD_CAST_1(SSE::double_m, AVX2:: float_m);
  15964. #ifdef Vc_IMPL_AVX2
  15965. Vc_SIMD_CAST_1(SSE::double_m, AVX2:: int_m);
  15966. Vc_SIMD_CAST_1(SSE::double_m, AVX2:: uint_m);
  15967. Vc_SIMD_CAST_1(SSE::double_m, AVX2:: short_m);
  15968. Vc_SIMD_CAST_1(SSE::double_m, AVX2::ushort_m);
  15969. #endif
  15970. Vc_SIMD_CAST_1(SSE:: float_m, AVX2::double_m);
  15971. Vc_SIMD_CAST_1(SSE:: int_m, AVX2::double_m);
  15972. Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::double_m);
  15973. Vc_SIMD_CAST_1(SSE:: short_m, AVX2::double_m);
  15974. Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::double_m);
  15975. Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: float_m);
  15976. Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: float_m);
  15977. Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: float_m);
  15978. Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: float_m);
  15979. Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: float_m);
  15980. #ifdef Vc_IMPL_AVX2
  15981. Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: int_m);
  15982. Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: uint_m);
  15983. Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: int_m);
  15984. Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: uint_m);
  15985. Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: int_m);
  15986. Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: uint_m);
  15987. Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: short_m);
  15988. Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: short_m);
  15989. Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: short_m);
  15990. Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: short_m);
  15991. Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: short_m);
  15992. Vc_SIMD_CAST_1(SSE:: float_m, AVX2::ushort_m);
  15993. Vc_SIMD_CAST_1(SSE:: int_m, AVX2::ushort_m);
  15994. Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::ushort_m);
  15995. Vc_SIMD_CAST_1(SSE:: short_m, AVX2::ushort_m);
  15996. Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::ushort_m);
  15997. Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: int_m);
  15998. Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: uint_m);
  15999. Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: int_m);
  16000. Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: uint_m);
  16001. #endif
  16002. Vc_SIMD_CAST_2(SSE::double_m, AVX2::double_m);
  16003. Vc_SIMD_CAST_2(SSE::double_m, AVX2:: float_m);
  16004. Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: float_m);
  16005. Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: float_m);
  16006. Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: float_m);
  16007. #ifdef Vc_IMPL_AVX2
  16008. Vc_SIMD_CAST_2(SSE::double_m, AVX2:: int_m);
  16009. Vc_SIMD_CAST_2(SSE::double_m, AVX2:: uint_m);
  16010. Vc_SIMD_CAST_2(SSE::double_m, AVX2:: short_m);
  16011. Vc_SIMD_CAST_2(SSE::double_m, AVX2::ushort_m);
  16012. Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: int_m);
  16013. Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: uint_m);
  16014. Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: short_m);
  16015. Vc_SIMD_CAST_2(SSE:: float_m, AVX2::ushort_m);
  16016. Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: int_m);
  16017. Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: uint_m);
  16018. Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: short_m);
  16019. Vc_SIMD_CAST_2(SSE:: int_m, AVX2::ushort_m);
  16020. Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: int_m);
  16021. Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: uint_m);
  16022. Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: short_m);
  16023. Vc_SIMD_CAST_2(SSE:: uint_m, AVX2::ushort_m);
  16024. Vc_SIMD_CAST_2(SSE:: short_m, AVX2:: short_m);
  16025. Vc_SIMD_CAST_2(SSE:: short_m, AVX2::ushort_m);
  16026. Vc_SIMD_CAST_2(SSE::ushort_m, AVX2:: short_m);
  16027. Vc_SIMD_CAST_2(SSE::ushort_m, AVX2::ushort_m);
  16028. #endif
  16029. Vc_SIMD_CAST_4(SSE::double_m, AVX2:: float_m);
  16030. #ifdef Vc_IMPL_AVX2
  16031. Vc_SIMD_CAST_4(SSE::double_m, AVX2:: int_m);
  16032. Vc_SIMD_CAST_4(SSE::double_m, AVX2:: uint_m);
  16033. Vc_SIMD_CAST_4(SSE::double_m, AVX2:: short_m);
  16034. Vc_SIMD_CAST_4(SSE::double_m, AVX2::ushort_m);
  16035. Vc_SIMD_CAST_4(SSE:: float_m, AVX2:: short_m);
  16036. Vc_SIMD_CAST_4(SSE:: float_m, AVX2::ushort_m);
  16037. Vc_SIMD_CAST_4(SSE:: int_m, AVX2:: short_m);
  16038. Vc_SIMD_CAST_4(SSE:: int_m, AVX2::ushort_m);
  16039. Vc_SIMD_CAST_4(SSE:: uint_m, AVX2:: short_m);
  16040. Vc_SIMD_CAST_4(SSE:: uint_m, AVX2::ushort_m);
  16041. #endif
  16042. template <typename Return, typename T>
  16043. Vc_INTRINSIC Vc_CONST Return
  16044. simd_cast(Scalar::Mask<T> k,
  16045. enable_if<AVX2::is_mask<Return>::value> = nullarg);
  16046. template <typename Return, typename T>
  16047. Vc_INTRINSIC Vc_CONST Return
  16048. simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1,
  16049. enable_if<AVX2::is_mask<Return>::value> = nullarg);
  16050. template <typename Return, typename T>
  16051. Vc_INTRINSIC Vc_CONST Return simd_cast(
  16052. Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
  16053. enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 4)> = nullarg);
  16054. template <typename Return, typename T>
  16055. Vc_INTRINSIC Vc_CONST Return simd_cast(
  16056. Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
  16057. Scalar::Mask<T> k4, Scalar::Mask<T> k5, Scalar::Mask<T> k6, Scalar::Mask<T> k7,
  16058. enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 8)> = nullarg);
  16059. template <typename Return, typename T>
  16060. Vc_INTRINSIC Vc_CONST Return
  16061. simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
  16062. Scalar::Mask<T> k4, Scalar::Mask<T> k5, Scalar::Mask<T> k6, Scalar::Mask<T> k7,
  16063. Scalar::Mask<T> k8, Scalar::Mask<T> k9, Scalar::Mask<T> k10,
  16064. Scalar::Mask<T> k11, Scalar::Mask<T> k12, Scalar::Mask<T> k13,
  16065. Scalar::Mask<T> k14, Scalar::Mask<T> k15,
  16066. enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 16)> = nullarg);
  16067. Vc_SIMD_CAST_1(AVX2::double_m, SSE::double_m);
  16068. Vc_SIMD_CAST_1(AVX2::double_m, SSE:: float_m);
  16069. Vc_SIMD_CAST_1(AVX2::double_m, SSE:: int_m);
  16070. Vc_SIMD_CAST_1(AVX2::double_m, SSE:: uint_m);
  16071. Vc_SIMD_CAST_1(AVX2::double_m, SSE:: short_m);
  16072. Vc_SIMD_CAST_1(AVX2::double_m, SSE::ushort_m);
  16073. Vc_SIMD_CAST_1(AVX2:: float_m, SSE::double_m);
  16074. Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: float_m);
  16075. Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: int_m);
  16076. Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: uint_m);
  16077. Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: short_m);
  16078. Vc_SIMD_CAST_1(AVX2:: float_m, SSE::ushort_m);
  16079. #ifdef Vc_IMPL_AVX2
  16080. Vc_SIMD_CAST_1(AVX2:: int_m, SSE::double_m);
  16081. Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: float_m);
  16082. Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: int_m);
  16083. Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: uint_m);
  16084. Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: short_m);
  16085. Vc_SIMD_CAST_1(AVX2:: int_m, SSE::ushort_m);
  16086. Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::double_m);
  16087. Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: float_m);
  16088. Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: int_m);
  16089. Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: uint_m);
  16090. Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: short_m);
  16091. Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::ushort_m);
  16092. Vc_SIMD_CAST_1(AVX2:: short_m, SSE::double_m);
  16093. Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: float_m);
  16094. Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: int_m);
  16095. Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: uint_m);
  16096. Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: short_m);
  16097. Vc_SIMD_CAST_1(AVX2:: short_m, SSE::ushort_m);
  16098. Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::double_m);
  16099. Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: float_m);
  16100. Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: int_m);
  16101. Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: uint_m);
  16102. Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: short_m);
  16103. Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::ushort_m);
  16104. #endif
  16105. Vc_SIMD_CAST_2(AVX2::double_m, SSE:: short_m);
  16106. Vc_SIMD_CAST_2(AVX2::double_m, SSE::ushort_m);
  16107. template <typename To, typename FromT>
  16108. Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Mask<FromT> x,
  16109. enable_if<Scalar::is_mask<To>::value> = nullarg);
  16110. template <typename Return, int offset, typename From>
  16111. Vc_INTRINSIC Vc_CONST enable_if<
  16112. (offset == 0 &&
  16113. ((AVX2::is_vector<From>::value && !Scalar::is_vector<Return>::value &&
  16114. Traits::is_simd_vector<Return>::value && !Traits::isSimdArray<Return>::value) ||
  16115. (AVX2::is_mask<From>::value && !Scalar::is_mask<Return>::value &&
  16116. Traits::is_simd_mask<Return>::value &&
  16117. !Traits::isSimdMaskArray<Return>::value))),
  16118. Return>
  16119. simd_cast(const From &x);
  16120. template <typename Return, int offset, typename From>
  16121. Vc_INTRINSIC Vc_CONST Return simd_cast(
  16122. const From &x,
  16123. enable_if<offset == 0 && ((SSE::is_vector<From>::value &&
  16124. AVX2::is_vector<Return>::value) ||
  16125. (SSE::is_mask<From>::value &&
  16126. AVX2::is_mask<Return>::value))> = nullarg);
  16127. template <typename Return, int offset, typename T>
  16128. Vc_INTRINSIC Vc_CONST enable_if<(AVX2::is_vector<Return>::value && offset != 0),
  16129. Return>
  16130. simd_cast(AVX2::Vector<T> x);
  16131. template <typename Return, int offset, typename T>
  16132. Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector<Return>::value &&
  16133. sizeof(AVX2::Vector<T>) == 32),
  16134. Return>
  16135. simd_cast(AVX2::Vector<T> x);
  16136. template <typename Return, int offset, typename T>
  16137. Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector<Return>::value &&
  16138. sizeof(AVX2::Vector<T>) == 16),
  16139. Return>
  16140. simd_cast(AVX2::Vector<T> x);
  16141. Vc_SIMD_CAST_OFFSET(SSE:: short_v, AVX2::double_v, 1);
  16142. Vc_SIMD_CAST_OFFSET(SSE::ushort_v, AVX2::double_v, 1);
  16143. Vc_SIMD_CAST_OFFSET(SSE:: short_m, AVX2::double_m, 1);
  16144. Vc_SIMD_CAST_OFFSET(SSE::ushort_m, AVX2::double_m, 1);
  16145. template <typename Return, int offset, typename T>
  16146. Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask<Return>::value &&
  16147. sizeof(AVX2::Mask<T>) == 32),
  16148. Return>
  16149. simd_cast(AVX2::Mask<T> x);
  16150. template <typename Return, int offset, typename T>
  16151. Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask<Return>::value &&
  16152. sizeof(AVX2::Mask<T>) == 16),
  16153. Return>
  16154. simd_cast(AVX2::Mask<T> x);
#undef Vc_SIMD_CAST_AVX_1
#define Vc_SIMD_CAST_AVX_1(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x, \
                                       enable_if<std::is_same<To, AVX2::to_>::value>)
#undef Vc_SIMD_CAST_AVX_2
#define Vc_SIMD_CAST_AVX_2(from_,to_) \
    static_assert(AVX2::from_::size() * 2 <= AVX2::to_::size(), \
                  "this type combination is wrong"); \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, \
                                       enable_if<std::is_same<To, AVX2::to_>::value>)
#undef Vc_SIMD_CAST_AVX_3
#define Vc_SIMD_CAST_AVX_3(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \
                                       enable_if<std::is_same<To, AVX2::to_>::value>)
#undef Vc_SIMD_CAST_AVX_4
#define Vc_SIMD_CAST_AVX_4(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \
                                       AVX2::from_ x3, \
                                       enable_if<std::is_same<To, AVX2::to_>::value>)
#undef Vc_SIMD_CAST_1
#define Vc_SIMD_CAST_1(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x, enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_2
#define Vc_SIMD_CAST_2(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_3
#define Vc_SIMD_CAST_3(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_4
#define Vc_SIMD_CAST_4(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_5
#define Vc_SIMD_CAST_5(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_6
#define Vc_SIMD_CAST_6(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
                                       from_ x5, \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_7
#define Vc_SIMD_CAST_7(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
                                       from_ x5, from_ x6, \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_8
#define Vc_SIMD_CAST_8(from_,to_) \
    template <typename To> \
    Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
                                       from_ x5, from_ x6, from_ x7, \
                                       enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_OFFSET
#define Vc_SIMD_CAST_OFFSET(from_,to_,offset_) \
    static_assert(from_::size() >= to_::size() * (offset_ + 1), \
                  "this offset cannot exist for this type combination"); \
    template <typename To, int offset> \
    Vc_INTRINSIC Vc_CONST To simd_cast( \
        from_ x, enable_if<(offset == offset_ && std::is_same<To, to_>::value)>)
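// From here on the redefined macros open the definitions that correspond to the
// declarations above; note that the defaulted nullarg argument is not repeated, as a
// default argument already given at the declaration must not be respecified.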
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x, enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
                             SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
    return simd_cast<SSE::Vector<typename To::EntryType>>(x).data();
}
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x0, From x1,
          enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
                     SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
    return simd_cast<SSE::Vector<typename To::EntryType>>(x0, x1).data();
}
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x0, From x1, From x2,
          enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
                     SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
    return simd_cast<SSE::Vector<typename To::EntryType>>(x0, x1, x2).data();
}
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x0, From x1, From x2, From x3,
          enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
                     SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
    return simd_cast<SSE::Vector<typename To::EntryType>>(x0, x1, x2, x3).data();
}
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7,
          enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
                     SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
    return simd_cast<SSE::Vector<typename To::EntryType>>(x0, x1, x2, x3, x4, x5, x6, x7)
        .data();
}
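// The overloads above handle destination types whose entry count equals that of the SSE
// vector with the same EntryType, i.e. AVX2 vector types that are only SSE-wide on the
// current target: the cast is simply forwarded to the SSE-width simd_cast and the
// resulting raw register is rewrapped via .data().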
Vc_SIMD_CAST_AVX_1( float_v, double_v) { return _mm256_cvtps_pd(AVX::lo128(x.data())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1(   int_v, double_v) { return AVX::convert<   int, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1(  uint_v, double_v) { return AVX::convert<  uint, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1( short_v, double_v) { return AVX::convert< short, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1(ushort_v, double_v) { return AVX::convert<ushort, double>(AVX::lo128(x.data())); }
#endif
Vc_SIMD_CAST_AVX_1(double_v, float_v) { return AVX::zeroExtend(_mm256_cvtpd_ps(x.data())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1(   int_v, float_v) { return AVX::convert<   int, float>(x.data()); }
Vc_SIMD_CAST_AVX_1(  uint_v, float_v) { return AVX::convert<  uint, float>(x.data()); }
Vc_SIMD_CAST_AVX_1( short_v, float_v) { return AVX::convert< short, float>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1(ushort_v, float_v) { return AVX::convert<ushort, float>(AVX::lo128(x.data())); }
#endif
Vc_SIMD_CAST_AVX_2(double_v, float_v) { return AVX::concat(_mm256_cvtpd_ps(x0.data()), _mm256_cvtpd_ps(x1.data())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1(double_v, int_v) { return AVX::zeroExtend(_mm256_cvttpd_epi32(x.data())); }
Vc_SIMD_CAST_AVX_1( float_v, int_v) { return _mm256_cvttps_epi32(x.data()); }
Vc_SIMD_CAST_AVX_1(  uint_v, int_v) { return x.data(); }
Vc_SIMD_CAST_AVX_1( short_v, int_v) { return _mm256_cvtepi16_epi32(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1(ushort_v, int_v) { return _mm256_cvtepu16_epi32(AVX::lo128(x.data())); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_2(double_v, int_v) { return AVX::concat(_mm256_cvttpd_epi32(x0.data()), _mm256_cvttpd_epi32(x1.data())); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1(double_v, uint_v) { return AVX::zeroExtend(AVX::convert<double, uint>(x.data())); }
Vc_SIMD_CAST_AVX_1( float_v, uint_v) {
    return _mm256_blendv_epi8(
        _mm256_cvttps_epi32(x.data()),
        _mm256_add_epi32(
            _mm256_cvttps_epi32(_mm256_sub_ps(x.data(), AVX::set2power31_ps())),
            AVX::set2power31_epu32()),
        _mm256_castps_si256(AVX::cmpge_ps(x.data(), AVX::set2power31_ps())));
}
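// float -> unsigned int needs special handling because _mm256_cvttps_epi32 only produces
// signed results: inputs >= 2^31 are converted after subtracting 2^31 and the bias is
// added back in the integer domain, with a blend on the >= 2^31 comparison mask
// selecting between the two conversions per element.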
  16302. Vc_SIMD_CAST_AVX_1( int_v, uint_v) { return x.data(); }
  16303. Vc_SIMD_CAST_AVX_1( short_v, uint_v) { return _mm256_cvtepi16_epi32(AVX::lo128(x.data())); }
  16304. Vc_SIMD_CAST_AVX_1(ushort_v, uint_v) { return _mm256_cvtepu16_epi32(AVX::lo128(x.data())); }
  16305. #endif
  16306. #ifdef Vc_IMPL_AVX2
  16307. Vc_SIMD_CAST_AVX_2(double_v, uint_v) { return AVX::concat(AVX::convert<double, uint>(x0.data()), AVX::convert<double, uint>(x1.data())); }
  16308. #endif
  16309. #ifdef Vc_IMPL_AVX2
  16310. Vc_SIMD_CAST_AVX_1(double_v, short_v) { return AVX::zeroExtend(_mm_packs_epi32(_mm256_cvttpd_epi32(x.data()), _mm_setzero_si128())); }
  16311. Vc_SIMD_CAST_AVX_1( float_v, short_v) {
  16312. const auto tmp = _mm256_cvttps_epi32(x.data());
  16313. return AVX::zeroExtend(_mm_packs_epi32(AVX::lo128(tmp), AVX::hi128(tmp)));
  16314. }
  16315. Vc_SIMD_CAST_AVX_1( int_v, short_v) { return AVX::zeroExtend(AVX::convert< int, short>(x.data())); }
  16316. Vc_SIMD_CAST_AVX_1( uint_v, short_v) { return AVX::zeroExtend(AVX::convert<uint, short>(x.data())); }
  16317. Vc_SIMD_CAST_AVX_1(ushort_v, short_v) { return x.data(); }
  16318. #endif
  16319. #ifdef Vc_IMPL_AVX2
  16320. Vc_SIMD_CAST_AVX_2(double_v, short_v) {
  16321. const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
  16322. const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
  16323. return AVX::zeroExtend(_mm_packs_epi32(tmp0, tmp1));
  16324. }
  16325. Vc_SIMD_CAST_AVX_2( float_v, short_v) {
  16326. using AVX2::short_v;
  16327. using AVX2::int_v;
  16328. return simd_cast<short_v>(simd_cast<int_v>(x0), simd_cast<int_v>(x1));
  16329. }
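// The two overloads below convert pairs of 32-bit integer vectors to one short
// vector by keeping only the low 16 bits of every 32-bit element (truncation
// rather than saturation): the byte shuffle extracts those low halves per
// 128-bit lane (the -0x80 entries zero the rest), unpacklo_epi64 pairs the
// results of x0 and x1 lane-wise, and permute4x64 restores the element order
// across lanes.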
  16330. Vc_SIMD_CAST_AVX_2( int_v, short_v) {
  16331. const auto shuf = _mm256_setr_epi8(
  16332. 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
  16333. 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
  16334. auto a = _mm256_shuffle_epi8(x0.data(), shuf);
  16335. auto b = _mm256_shuffle_epi8(x1.data(), shuf);
  16336. return Mem::permute4x64<X0, X2, X1, X3>(_mm256_unpacklo_epi64(a, b));
  16337. }
  16338. Vc_SIMD_CAST_AVX_2( uint_v, short_v) {
  16339. const auto shuf = _mm256_setr_epi8(
  16340. 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
  16341. 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
  16342. auto a = _mm256_shuffle_epi8(x0.data(), shuf);
  16343. auto b = _mm256_shuffle_epi8(x1.data(), shuf);
  16344. return Mem::permute4x64<X0, X2, X1, X3>(_mm256_unpacklo_epi64(a, b));
  16345. }
  16346. #endif
  16347. #ifdef Vc_IMPL_AVX2
  16348. Vc_SIMD_CAST_AVX_3(double_v, short_v) {
  16349. const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
  16350. const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
  16351. const auto tmp2 = _mm256_cvttpd_epi32(x2.data());
  16352. return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, _mm_setzero_si128()));
  16353. }
  16354. #endif
  16355. #ifdef Vc_IMPL_AVX2
  16356. Vc_SIMD_CAST_AVX_4(double_v, short_v) {
  16357. const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
  16358. const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
  16359. const auto tmp2 = _mm256_cvttpd_epi32(x2.data());
  16360. const auto tmp3 = _mm256_cvttpd_epi32(x3.data());
  16361. return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, tmp3));
  16362. }
  16363. #endif
  16364. #ifdef Vc_IMPL_AVX2
  16365. Vc_SIMD_CAST_AVX_1(double_v, ushort_v) {
  16366. const auto tmp = _mm256_cvttpd_epi32(x.data());
  16367. return AVX::zeroExtend(_mm_packus_epi32(tmp, _mm_setzero_si128()));
  16368. }
  16369. Vc_SIMD_CAST_AVX_1( float_v, ushort_v) {
  16370. const auto tmp = _mm256_cvttps_epi32(x.data());
  16371. return AVX::zeroExtend(_mm_packus_epi32(AVX::lo128(tmp), AVX::hi128(tmp)));
  16372. }
  16373. Vc_SIMD_CAST_AVX_1( int_v, ushort_v) { return AVX::zeroExtend(AVX::convert< int, ushort>(x.data())); }
  16374. Vc_SIMD_CAST_AVX_1( uint_v, ushort_v) { return AVX::zeroExtend(AVX::convert<uint, ushort>(x.data())); }
  16375. Vc_SIMD_CAST_AVX_1( short_v, ushort_v) { return x.data(); }
  16376. #endif
  16377. #ifdef Vc_IMPL_AVX2
  16378. Vc_SIMD_CAST_AVX_2(double_v, ushort_v) {
  16379. const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
  16380. const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
  16381. return AVX::zeroExtend(_mm_packus_epi32(tmp0, tmp1));
  16382. }
  16383. Vc_SIMD_CAST_AVX_2( float_v, ushort_v) {
  16384. using AVX2::ushort_v;
  16385. using AVX2::int_v;
  16386. return simd_cast<ushort_v>(simd_cast<int_v>(x0), simd_cast<int_v>(x1));
  16387. }
  16388. Vc_SIMD_CAST_AVX_2( int_v, ushort_v) {
  16389. auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data());
  16390. auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data());
  16391. auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1);
  16392. auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1);
  16393. return Mem::permute4x64<X0, X2, X1, X3>(_mm256_unpacklo_epi16(tmp2, tmp3));
  16394. }
  16395. Vc_SIMD_CAST_AVX_2( uint_v, ushort_v) {
  16396. auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data());
  16397. auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data());
  16398. auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1);
  16399. auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1);
  16400. return Mem::permute4x64<X0, X2, X1, X3>(_mm256_unpacklo_epi16(tmp2, tmp3));
  16401. }
  16402. #endif
  16403. #ifdef Vc_IMPL_AVX2
  16404. Vc_SIMD_CAST_AVX_3(double_v, ushort_v) {
  16405. const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
  16406. const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
  16407. const auto tmp2 = _mm256_cvttpd_epi32(x2.data());
  16408. return AVX::concat(_mm_packus_epi32(tmp0, tmp1),
  16409. _mm_packus_epi32(tmp2, _mm_setzero_si128()));
  16410. }
  16411. #endif
  16412. #ifdef Vc_IMPL_AVX2
  16413. Vc_SIMD_CAST_AVX_4(double_v, ushort_v) {
  16414. const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
  16415. const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
  16416. const auto tmp2 = _mm256_cvttpd_epi32(x2.data());
  16417. const auto tmp3 = _mm256_cvttpd_epi32(x3.data());
  16418. return AVX::concat(_mm_packus_epi32(tmp0, tmp1), _mm_packus_epi32(tmp2, tmp3));
  16419. }
  16420. #endif
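// From here on: SSE -> AVX2 widening casts. A single 128-bit source is either
// zero-extended into the low half of a 256-bit register (AVX::zeroExtend) or
// converted element-wise; two sources of matching element type are simply
// concatenated (AVX::concat). A hedged usage sketch (the two-argument integer
// form requires Vc_IMPL_AVX2):
//
//   Vc::SSE::int_v  a = ..., b = ...;
//   Vc::AVX2::int_v c = Vc::simd_cast<Vc::AVX2::int_v>(a, b);  // low half from a, high half from b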
  16421. Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v) { return AVX::zeroExtend(x.data()); }
  16422. Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v) { return _mm256_cvtps_pd(x.data()); }
  16423. Vc_SIMD_CAST_1(SSE:: int_v, AVX2::double_v) { return _mm256_cvtepi32_pd(x.data()); }
  16424. Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::double_v) { using namespace AvxIntrinsics; return _mm256_add_pd(_mm256_cvtepi32_pd(_mm_sub_epi32(x.data(), _mm_setmin_epi32())), set1_pd(1u << 31)); }
  16425. Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v) { return simd_cast<AVX2::double_v>(simd_cast<SSE::int_v>(x)); }
  16426. Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v) { return simd_cast<AVX2::double_v>(simd_cast<SSE::int_v>(x)); }
  16427. Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast<SSE:: float_v>(x).data()); }
  16428. Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v) { return AVX::zeroExtend(x.data()); }
  16429. Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: float_v) { return AVX::zeroExtend(_mm_cvtepi32_ps(x.data())); }
  16430. Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast<SSE::float_v>(x).data()); }
  16431. Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v) { return AVX::convert< short, float>(x.data()); }
  16432. Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: float_v) { return AVX::convert<ushort, float>(x.data()); }
  16433. #ifdef Vc_IMPL_AVX2
  16434. Vc_SIMD_CAST_1(SSE::double_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast<SSE:: int_v>(x).data()); }
  16435. Vc_SIMD_CAST_1(SSE::double_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast<SSE:: uint_v>(x).data()); }
  16436. Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x).data()); }
  16437. Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x).data()); }
  16438. Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast<SSE::int_v>(x).data()); }
  16439. Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast<SSE::uint_v>(x).data()); }
  16440. Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE::short_v>(x).data()); }
  16441. Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x).data()); }
  16442. Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: int_v) { return AVX::zeroExtend(x.data()); }
  16443. Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: int_v) { return AVX::zeroExtend(x.data()); }
  16444. Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: int_v) { return AVX::convert< short, int>(x.data()); }
  16445. Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: int_v) { return AVX::convert<ushort, int>(x.data()); }
  16446. Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: uint_v) { return AVX::zeroExtend(x.data()); }
  16447. Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: uint_v) { return AVX::zeroExtend(x.data()); }
  16448. Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: uint_v) { return AVX::convert< short, uint>(x.data()); }
  16449. Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: uint_v) { return AVX::convert<ushort, uint>(x.data()); }
  16450. Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE::short_v>(x).data()); }
  16451. Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE::short_v>(x).data()); }
  16452. Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v) { return AVX::zeroExtend(x.data()); }
  16453. Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v) { return AVX::zeroExtend(x.data()); }
  16454. Vc_SIMD_CAST_1(SSE:: int_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x).data()); }
  16455. Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x).data()); }
  16456. Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v) { return AVX::zeroExtend(x.data()); }
  16457. Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v) { return AVX::zeroExtend(x.data()); }
  16458. #endif
  16459. Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v) { return AVX::concat(x0.data(), x1.data()); }
  16460. Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast<SSE:: float_v>(x0, x1).data()); }
  16461. Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v) { return AVX::concat(x0.data(), x1.data()); }
  16462. Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: float_v) { return AVX::convert< int, float>(AVX::concat(x0.data(), x1.data())); }
  16463. Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: float_v) { return AVX::convert<uint, float>(AVX::concat(x0.data(), x1.data())); }
  16464. #ifdef Vc_IMPL_AVX2
  16465. Vc_SIMD_CAST_2(SSE::double_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast<SSE:: int_v>(x0, x1).data()); }
  16466. Vc_SIMD_CAST_2(SSE::double_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast<SSE:: uint_v>(x0, x1).data()); }
  16467. Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1).data()); }
  16468. Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1).data()); }
  16469. Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: int_v) { return simd_cast<AVX2:: int_v>(simd_cast<AVX2::float_v>(x0, x1)); }
  16470. Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: uint_v) { return simd_cast<AVX2::uint_v>(simd_cast<AVX2::float_v>(x0, x1)); }
  16471. Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1).data()); }
  16472. Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1).data()); }
  16473. Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: int_v) { return AVX::concat(x0.data(), x1.data()); }
  16474. Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: int_v) { return AVX::concat(x0.data(), x1.data()); }
  16475. Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: uint_v) { return AVX::concat(x0.data(), x1.data()); }
  16476. Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: uint_v) { return AVX::concat(x0.data(), x1.data()); }
  16477. Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1).data()); }
  16478. Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1).data()); }
  16479. Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v) { return AVX::concat(x0.data(), x1.data()); }
  16480. Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v) { return AVX::concat(x0.data(), x1.data()); }
  16481. Vc_SIMD_CAST_2(SSE:: int_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1).data()); }
  16482. Vc_SIMD_CAST_2(SSE:: uint_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1).data()); }
  16483. Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v) { return AVX::concat(x0.data(), x1.data()); }
  16484. Vc_SIMD_CAST_2(SSE::ushort_v, AVX2::ushort_v) { return AVX::concat(x0.data(), x1.data()); }
  16485. #endif
  16486. Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v) { return simd_cast<AVX2:: float_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2)); }
  16487. #ifdef Vc_IMPL_AVX2
  16488. Vc_SIMD_CAST_3(SSE::double_v, AVX2:: int_v) { return simd_cast<AVX2:: int_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2)); }
  16489. Vc_SIMD_CAST_3(SSE::double_v, AVX2:: uint_v) { return simd_cast<AVX2::uint_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2)); }
  16490. Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1, x2).data()); }
  16491. Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1, x2).data()); }
  16492. Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::float_v>(x0, x1), simd_cast<AVX2::float_v>(x2)); }
  16493. Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::float_v>(x0, x1), simd_cast<AVX2::float_v>(x2)); }
  16494. Vc_SIMD_CAST_3(SSE:: int_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2:: int_v>(x0, x1), simd_cast<AVX2:: int_v>(x2)); }
  16495. Vc_SIMD_CAST_3(SSE:: uint_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::uint_v>(x0, x1), simd_cast<AVX2::uint_v>(x2)); }
  16496. Vc_SIMD_CAST_3(SSE:: int_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2:: int_v>(x0, x1), simd_cast<AVX2:: int_v>(x2)); }
  16497. Vc_SIMD_CAST_3(SSE:: uint_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::uint_v>(x0, x1), simd_cast<AVX2::uint_v>(x2)); }
  16498. #endif
  16499. Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v) { return simd_cast<AVX2:: float_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3)); }
  16500. #ifdef Vc_IMPL_AVX2
  16501. Vc_SIMD_CAST_4(SSE::double_v, AVX2:: int_v) { return simd_cast<AVX2:: int_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3)); }
  16502. Vc_SIMD_CAST_4(SSE::double_v, AVX2:: uint_v) { return simd_cast<AVX2::uint_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3)); }
  16503. Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1, x2, x3).data()); }
  16504. Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1, x2, x3).data()); }
  16505. Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::float_v>(x0, x1), simd_cast<AVX2::float_v>(x2, x3)); }
  16506. Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::float_v>(x0, x1), simd_cast<AVX2::float_v>(x2, x3)); }
  16507. Vc_SIMD_CAST_4(SSE:: int_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2:: int_v>(x0, x1), simd_cast<AVX2:: int_v>(x2, x3)); }
  16508. Vc_SIMD_CAST_4(SSE:: uint_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::uint_v>(x0, x1), simd_cast<AVX2::uint_v>(x2, x3)); }
  16509. Vc_SIMD_CAST_4(SSE:: int_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2:: int_v>(x0, x1), simd_cast<AVX2:: int_v>(x2, x3)); }
  16510. Vc_SIMD_CAST_4(SSE:: uint_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::uint_v>(x0, x1), simd_cast<AVX2::uint_v>(x2, x3)); }
  16511. #endif
  16512. #ifdef Vc_IMPL_AVX2
  16513. Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4)); }
  16514. Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4)); }
  16515. #endif
  16516. #ifdef Vc_IMPL_AVX2
  16517. Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5)); }
  16518. Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5)); }
  16519. #endif
  16520. #ifdef Vc_IMPL_AVX2
  16521. Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5), simd_cast<AVX2::double_v>(x6)); }
  16522. Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5), simd_cast<AVX2::double_v>(x6)); }
  16523. #endif
  16524. #ifdef Vc_IMPL_AVX2
  16525. Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5), simd_cast<AVX2::double_v>(x6, x7)); }
  16526. Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5), simd_cast<AVX2::double_v>(x6, x7)); }
  16527. #endif
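// The casts below go the other way, AVX2 -> SSE: where the element type is
// unchanged the low 128 bits are returned (AVX::lo128), otherwise the value is
// converted first and the result narrowed to a single SSE register. Source
// elements that do not fit into the smaller destination are dropped, mirroring
// the widening casts above.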
  16528. Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v) { return AVX::lo128(x.data()); }
  16529. Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v) { return AVX::lo128(x.data()); }
  16530. #ifdef Vc_IMPL_AVX2
  16531. Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: int_v) { return AVX::lo128(x.data()); }
  16532. Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: uint_v) { return AVX::lo128(x.data()); }
  16533. Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v) { return AVX::lo128(x.data()); }
  16534. Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v) { return AVX::lo128(x.data()); }
  16535. #endif
  16536. Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v) { return simd_cast<SSE:: float_v>(simd_cast<AVX2:: float_v>(x)); }
  16537. Vc_SIMD_CAST_1(AVX2::double_v, SSE:: int_v) { return AVX::convert<double, int>(x.data()); }
  16538. Vc_SIMD_CAST_1(AVX2::double_v, SSE:: uint_v) { return AVX::convert<double, unsigned int>(x.data()); }
  16539. Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v) { return AVX::convert<double, short>(x.data()); }
  16540. Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v) { return AVX::convert<double, unsigned short>(x.data()); }
  16541. Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v) { return simd_cast<SSE::double_v>(simd_cast<SSE:: float_v>(x)); }
  16542. Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: int_v) { return simd_cast<SSE:: int_v>(simd_cast<SSE:: float_v>(x)); }
  16543. Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: uint_v) { return simd_cast<SSE:: uint_v>(simd_cast<SSE:: float_v>(x)); }
  16544. Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v) { return AVX::convert<float, short>(x.data()); }
  16545. Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v) { return AVX::convert<float, unsigned short>(x.data()); }
  16546. #ifdef Vc_IMPL_AVX2
  16547. Vc_SIMD_CAST_1(AVX2:: int_v, SSE::double_v) { return SSE::convert<int, double>(AVX::lo128(x.data())); }
  16548. Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: float_v) { return SSE::convert<int, float>(AVX::lo128(x.data())); }
  16549. Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: uint_v) { return AVX::lo128(x.data()); }
  16550. Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: short_v) { return AVX::convert<int, short>(x.data()); }
  16551. Vc_SIMD_CAST_1(AVX2:: int_v, SSE::ushort_v) { return AVX::convert<int, ushort>(x.data()); }
  16552. Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::double_v) { return SSE::convert<uint, double>(AVX::lo128(x.data())); }
  16553. Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: float_v) { return SSE::convert<uint, float>(AVX::lo128(x.data())); }
  16554. Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: int_v) { return AVX::lo128(x.data()); }
  16555. Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: short_v) { return AVX::convert<uint, short>(x.data()); }
  16556. Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::ushort_v) { return AVX::convert<uint, ushort>(x.data()); }
  16557. Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v) { return simd_cast<SSE::double_v>(simd_cast<SSE:: short_v>(x)); }
  16558. Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v) { return simd_cast<SSE:: float_v>(simd_cast<SSE:: short_v>(x)); }
  16559. Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: int_v) { return simd_cast<SSE:: int_v>(simd_cast<SSE:: short_v>(x)); }
  16560. Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: uint_v) { return simd_cast<SSE:: uint_v>(simd_cast<SSE:: short_v>(x)); }
  16561. Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE:: short_v>(x)); }
  16562. Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v) { return simd_cast<SSE::double_v>(simd_cast<SSE::ushort_v>(x)); }
  16563. Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v) { return simd_cast<SSE:: float_v>(simd_cast<SSE::ushort_v>(x)); }
  16564. Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: int_v) { return simd_cast<SSE:: int_v>(simd_cast<SSE::ushort_v>(x)); }
  16565. Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: uint_v) { return simd_cast<SSE:: uint_v>(simd_cast<SSE::ushort_v>(x)); }
  16566. Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v) { return simd_cast<SSE:: short_v>(simd_cast<SSE::ushort_v>(x)); }
  16567. #endif
  16568. Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v) {
  16569. const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
  16570. const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
  16571. return _mm_packs_epi32(tmp0, tmp1);
  16572. }
  16573. Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v) {
  16574. const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
  16575. const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
  16576. return _mm_packus_epi32(tmp0, tmp1);
  16577. }
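// Next: building AVX2 vectors from individual Scalar::Vector<T> values. Each
// overload takes as many scalars as fit (or fewer) and fills the remaining
// lanes with zeros via the _mm256_setr_* intrinsics. Roughly, at the user
// level (a sketch, assuming the AVX implementation is enabled):
//
//   Vc::Scalar::float_v s0(1.f), s1(2.f);
//   auto v = Vc::simd_cast<Vc::AVX2::float_v>(s0, s1);  // [1, 2, 0, 0, 0, 0, 0, 0]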
  16578. template <typename Return, typename T>
  16579. Vc_INTRINSIC Vc_CONST Return
  16580. simd_cast(Scalar::Vector<T> x,
  16581. enable_if<std::is_same<Return, AVX2::double_v>::value>)
  16582. {
  16583. return AVX::zeroExtend(_mm_setr_pd(x.data(), 0.));
  16584. }
  16585. template <typename Return, typename T>
  16586. Vc_INTRINSIC Vc_CONST Return
  16587. simd_cast(Scalar::Vector<T> x,
  16588. enable_if<std::is_same<Return, AVX2::float_v>::value>)
  16589. {
  16590. return AVX::zeroExtend(_mm_setr_ps(x.data(), 0.f, 0.f, 0.f));
  16591. }
  16592. #ifdef Vc_IMPL_AVX2
  16593. template <typename Return, typename T>
  16594. Vc_INTRINSIC Vc_CONST Return
  16595. simd_cast(Scalar::Vector<T> x,
  16596. enable_if<std::is_same<Return, AVX2::int_v>::value>)
  16597. {
  16598. return _mm256_setr_epi32(x.data(), 0, 0, 0, 0, 0, 0, 0);
  16599. }
  16600. template <typename Return, typename T>
  16601. Vc_INTRINSIC Vc_CONST Return
  16602. simd_cast(Scalar::Vector<T> x,
  16603. enable_if<std::is_same<Return, AVX2::uint_v>::value>)
  16604. {
  16605. return _mm256_setr_epi32(uint(x.data()), 0, 0, 0, 0, 0, 0, 0);
  16606. }
  16607. template <typename Return, typename T>
  16608. Vc_INTRINSIC Vc_CONST Return
  16609. simd_cast(Scalar::Vector<T> x,
  16610. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  16611. {
  16612. return _mm256_setr_epi16(x.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16613. }
  16614. template <typename Return, typename T>
  16615. Vc_INTRINSIC Vc_CONST Return
  16616. simd_cast(Scalar::Vector<T> x,
  16617. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  16618. {
  16619. return _mm256_setr_epi16(x.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16620. }
  16621. #endif
  16622. template <typename Return, typename T>
  16623. Vc_INTRINSIC Vc_CONST Return
  16624. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  16625. enable_if<std::is_same<Return, AVX2::double_v>::value>)
  16626. {
  16627. return AVX::zeroExtend(_mm_setr_pd(x0.data(), x1.data()));
  16628. }
  16629. template <typename Return, typename T>
  16630. Vc_INTRINSIC Vc_CONST Return
  16631. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  16632. enable_if<std::is_same<Return, AVX2::float_v>::value>)
  16633. {
  16634. return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), 0.f, 0.f));
  16635. }
  16636. #ifdef Vc_IMPL_AVX2
  16637. template <typename Return, typename T>
  16638. Vc_INTRINSIC Vc_CONST Return
  16639. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  16640. enable_if<std::is_same<Return, AVX2::int_v>::value>)
  16641. {
  16642. return _mm256_setr_epi32(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0);
  16643. }
  16644. template <typename Return, typename T>
  16645. Vc_INTRINSIC Vc_CONST Return
  16646. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  16647. enable_if<std::is_same<Return, AVX2::uint_v>::value>)
  16648. {
  16649. return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), 0, 0, 0, 0, 0, 0);
  16650. }
  16651. template <typename Return, typename T>
  16652. Vc_INTRINSIC Vc_CONST Return
  16653. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  16654. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  16655. {
  16656. return _mm256_setr_epi16(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16657. }
  16658. template <typename Return, typename T>
  16659. Vc_INTRINSIC Vc_CONST Return
  16660. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
  16661. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  16662. {
  16663. return _mm256_setr_epi16(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16664. }
  16665. #endif
  16666. template <typename Return, typename T>
  16667. Vc_INTRINSIC Vc_CONST Return
  16668. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16669. enable_if<std::is_same<Return, AVX2::double_v>::value>)
  16670. {
  16671. return _mm256_setr_pd(x0.data(), x1.data(), x2.data(), 0);
  16672. }
  16673. template <typename Return, typename T>
  16674. Vc_INTRINSIC Vc_CONST Return
  16675. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16676. enable_if<std::is_same<Return, AVX2::float_v>::value>)
  16677. {
  16678. return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), x2.data(), 0));
  16679. }
  16680. #ifdef Vc_IMPL_AVX2
  16681. template <typename Return, typename T>
  16682. Vc_INTRINSIC Vc_CONST Return
  16683. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16684. enable_if<std::is_same<Return, AVX2::int_v>::value>)
  16685. {
  16686. return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0);
  16687. }
  16688. template <typename Return, typename T>
  16689. Vc_INTRINSIC Vc_CONST Return
  16690. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16691. enable_if<std::is_same<Return, AVX2::uint_v>::value>)
  16692. {
  16693. return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), 0, 0, 0,
  16694. 0, 0);
  16695. }
  16696. template <typename Return, typename T>
  16697. Vc_INTRINSIC Vc_CONST Return
  16698. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16699. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  16700. {
  16701. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16702. }
  16703. template <typename Return, typename T>
  16704. Vc_INTRINSIC Vc_CONST Return
  16705. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16706. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  16707. {
  16708. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16709. }
  16710. #endif
  16711. template <typename Return, typename T>
  16712. Vc_INTRINSIC Vc_CONST Return
  16713. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16714. Scalar::Vector<T> x3,
  16715. enable_if<std::is_same<Return, AVX2::double_v>::value>)
  16716. {
  16717. return _mm256_setr_pd(x0.data(), x1.data(), x2.data(), x3.data());
  16718. }
  16719. template <typename Return, typename T>
  16720. Vc_INTRINSIC Vc_CONST Return
  16721. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16722. Scalar::Vector<T> x3,
  16723. enable_if<std::is_same<Return, AVX2::float_v>::value>)
  16724. {
  16725. return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), x2.data(), x3.data()));
  16726. }
  16727. #ifdef Vc_IMPL_AVX2
  16728. template <typename Return, typename T>
  16729. Vc_INTRINSIC Vc_CONST Return
  16730. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16731. Scalar::Vector<T> x3,
  16732. enable_if<std::is_same<Return, AVX2::int_v>::value>)
  16733. {
  16734. return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0);
  16735. }
  16736. template <typename Return, typename T>
  16737. Vc_INTRINSIC Vc_CONST Return
  16738. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16739. Scalar::Vector<T> x3,
  16740. enable_if<std::is_same<Return, AVX2::uint_v>::value>)
  16741. {
  16742. return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
  16743. uint(x3.data()), 0, 0, 0, 0);
  16744. }
  16745. template <typename Return, typename T>
  16746. Vc_INTRINSIC Vc_CONST Return
  16747. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16748. Scalar::Vector<T> x3,
  16749. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  16750. {
  16751. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16752. }
  16753. template <typename Return, typename T>
  16754. Vc_INTRINSIC Vc_CONST Return
  16755. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16756. Scalar::Vector<T> x3,
  16757. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  16758. {
  16759. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16760. }
  16761. #endif
  16762. template <typename Return, typename T>
  16763. Vc_INTRINSIC Vc_CONST Return
  16764. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16765. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  16766. enable_if<std::is_same<Return, AVX2::float_v>::value>)
  16767. {
  16768. return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0);
  16769. }
  16770. #ifdef Vc_IMPL_AVX2
  16771. template <typename Return, typename T>
  16772. Vc_INTRINSIC Vc_CONST Return
  16773. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16774. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  16775. enable_if<std::is_same<Return, AVX2::int_v>::value>)
  16776. {
  16777. return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0);
  16778. }
  16779. template <typename Return, typename T>
  16780. Vc_INTRINSIC Vc_CONST Return
  16781. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16782. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  16783. enable_if<std::is_same<Return, AVX2::uint_v>::value>)
  16784. {
  16785. return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
  16786. uint(x3.data()), uint(x4.data()), 0, 0, 0);
  16787. }
  16788. template <typename Return, typename T>
  16789. Vc_INTRINSIC Vc_CONST Return
  16790. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16791. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  16792. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  16793. {
  16794. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16795. }
  16796. template <typename Return, typename T>
  16797. Vc_INTRINSIC Vc_CONST Return
  16798. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16799. Scalar::Vector<T> x3, Scalar::Vector<T> x4,
  16800. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  16801. {
  16802. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16803. }
  16804. #endif
  16805. template <typename Return, typename T>
  16806. Vc_INTRINSIC Vc_CONST Return
  16807. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16808. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16809. enable_if<std::is_same<Return, AVX2::float_v>::value>)
  16810. {
  16811. return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16812. x5.data(), 0, 0);
  16813. }
  16814. #ifdef Vc_IMPL_AVX2
  16815. template <typename Return, typename T>
  16816. Vc_INTRINSIC Vc_CONST Return
  16817. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16818. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16819. enable_if<std::is_same<Return, AVX2::int_v>::value>)
  16820. {
  16821. return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16822. x5.data(), 0, 0);
  16823. }
  16824. template <typename Return, typename T>
  16825. Vc_INTRINSIC Vc_CONST Return
  16826. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16827. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16828. enable_if<std::is_same<Return, AVX2::uint_v>::value>)
  16829. {
  16830. return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
  16831. uint(x3.data()), uint(x4.data()), uint(x5.data()), 0, 0);
  16832. }
  16833. template <typename Return, typename T>
  16834. Vc_INTRINSIC Vc_CONST Return
  16835. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16836. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16837. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  16838. {
  16839. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16840. x5.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16841. }
  16842. template <typename Return, typename T>
  16843. Vc_INTRINSIC Vc_CONST Return
  16844. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16845. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16846. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  16847. {
  16848. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16849. x5.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16850. }
  16851. #endif
  16852. template <typename Return, typename T>
  16853. Vc_INTRINSIC Vc_CONST Return
  16854. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16855. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16856. Scalar::Vector<T> x6,
  16857. enable_if<std::is_same<Return, AVX2::float_v>::value>)
  16858. {
  16859. return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16860. x5.data(), x6.data(), 0);
  16861. }
  16862. #ifdef Vc_IMPL_AVX2
  16863. template <typename Return, typename T>
  16864. Vc_INTRINSIC Vc_CONST Return
  16865. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16866. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16867. Scalar::Vector<T> x6,
  16868. enable_if<std::is_same<Return, AVX2::int_v>::value>)
  16869. {
  16870. return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16871. x5.data(), x6.data(), 0);
  16872. }
  16873. template <typename Return, typename T>
  16874. Vc_INTRINSIC Vc_CONST Return
  16875. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16876. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16877. Scalar::Vector<T> x6,
  16878. enable_if<std::is_same<Return, AVX2::uint_v>::value>)
  16879. {
  16880. return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
  16881. uint(x3.data()), uint(x4.data()), uint(x5.data()),
  16882. uint(x6.data()), 0);
  16883. }
  16884. template <typename Return, typename T>
  16885. Vc_INTRINSIC Vc_CONST Return
  16886. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16887. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16888. Scalar::Vector<T> x6,
  16889. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  16890. {
  16891. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16892. x5.data(), x6.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16893. }
  16894. template <typename Return, typename T>
  16895. Vc_INTRINSIC Vc_CONST Return
  16896. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16897. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16898. Scalar::Vector<T> x6,
  16899. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  16900. {
  16901. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16902. x5.data(), x6.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0);
  16903. }
  16904. #endif
  16905. template <typename Return, typename T>
  16906. Vc_INTRINSIC Vc_CONST Return
  16907. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16908. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16909. Scalar::Vector<T> x6, Scalar::Vector<T> x7,
  16910. enable_if<std::is_same<Return, AVX2::float_v>::value>)
  16911. {
  16912. return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16913. x5.data(), x6.data(), x7.data());
  16914. }
  16915. #ifdef Vc_IMPL_AVX2
  16916. template <typename Return, typename T>
  16917. Vc_INTRINSIC Vc_CONST Return
  16918. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16919. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16920. Scalar::Vector<T> x6, Scalar::Vector<T> x7,
  16921. enable_if<std::is_same<Return, AVX2::int_v>::value>)
  16922. {
  16923. return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16924. x5.data(), x6.data(), x7.data());
  16925. }
  16926. template <typename Return, typename T>
  16927. Vc_INTRINSIC Vc_CONST Return
  16928. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16929. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16930. Scalar::Vector<T> x6, Scalar::Vector<T> x7,
  16931. enable_if<std::is_same<Return, AVX2::uint_v>::value>)
  16932. {
  16933. return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
  16934. uint(x3.data()), uint(x4.data()), uint(x5.data()),
  16935. uint(x6.data()), uint(x7.data()));
  16936. }
  16937. template <typename Return, typename T>
  16938. Vc_INTRINSIC Vc_CONST Return
  16939. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16940. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16941. Scalar::Vector<T> x6, Scalar::Vector<T> x7,
  16942. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  16943. {
  16944. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16945. x5.data(), x6.data(), x7.data(), 0, 0, 0, 0, 0, 0, 0, 0);
  16946. }
  16947. template <typename Return, typename T>
  16948. Vc_INTRINSIC Vc_CONST Return
  16949. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16950. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16951. Scalar::Vector<T> x6, Scalar::Vector<T> x7,
  16952. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  16953. {
  16954. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16955. x5.data(), x6.data(), x7.data(), 0, 0, 0, 0, 0, 0, 0, 0);
  16956. }
  16957. #endif
  16958. #ifdef Vc_IMPL_AVX2
  16959. template <typename Return, typename T>
  16960. Vc_INTRINSIC Vc_CONST Return
  16961. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16962. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16963. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  16964. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  16965. {
  16966. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16967. x5.data(), x6.data(), x7.data(), x8.data(), 0, 0, 0, 0, 0, 0,
  16968. 0);
  16969. }
  16970. template <typename Return, typename T>
  16971. Vc_INTRINSIC Vc_CONST Return
  16972. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16973. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16974. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  16975. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  16976. {
  16977. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16978. x5.data(), x6.data(), x7.data(), x8.data(), 0, 0, 0, 0, 0, 0,
  16979. 0);
  16980. }
  16981. #endif
  16982. #ifdef Vc_IMPL_AVX2
  16983. template <typename Return, typename T>
  16984. Vc_INTRINSIC Vc_CONST Return
  16985. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16986. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16987. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  16988. Scalar::Vector<T> x9, enable_if<std::is_same<Return, AVX2::short_v>::value>)
  16989. {
  16990. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  16991. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), 0, 0,
  16992. 0, 0, 0, 0);
  16993. }
  16994. template <typename Return, typename T>
  16995. Vc_INTRINSIC Vc_CONST Return
  16996. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  16997. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  16998. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  16999. Scalar::Vector<T> x9, enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  17000. {
  17001. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17002. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), 0, 0,
  17003. 0, 0, 0, 0);
  17004. }
  17005. #endif
  17006. #ifdef Vc_IMPL_AVX2
  17007. template <typename Return, typename T>
  17008. Vc_INTRINSIC Vc_CONST Return
  17009. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  17010. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  17011. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  17012. Scalar::Vector<T> x9, Scalar::Vector<T> x10,
  17013. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  17014. {
  17015. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17016. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
  17017. x10.data(), 0, 0, 0, 0, 0);
  17018. }
  17019. template <typename Return, typename T>
  17020. Vc_INTRINSIC Vc_CONST Return
  17021. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  17022. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  17023. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  17024. Scalar::Vector<T> x9, Scalar::Vector<T> x10,
  17025. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  17026. {
  17027. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17028. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
  17029. x10.data(), 0, 0, 0, 0, 0);
  17030. }
  17031. #endif
  17032. #ifdef Vc_IMPL_AVX2
  17033. template <typename Return, typename T>
  17034. Vc_INTRINSIC Vc_CONST Return
  17035. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  17036. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  17037. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  17038. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  17039. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  17040. {
  17041. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17042. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
  17043. x10.data(), x11.data(), 0, 0, 0, 0);
  17044. }
  17045. template <typename Return, typename T>
  17046. Vc_INTRINSIC Vc_CONST Return
  17047. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  17048. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  17049. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  17050. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  17051. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  17052. {
  17053. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17054. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
  17055. x10.data(), x11.data(), 0, 0, 0, 0);
  17056. }
  17057. #endif
  17058. #ifdef Vc_IMPL_AVX2
  17059. template <typename Return, typename T>
  17060. Vc_INTRINSIC Vc_CONST Return
  17061. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  17062. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  17063. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  17064. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  17065. Scalar::Vector<T> x12, enable_if<std::is_same<Return, AVX2::short_v>::value>)
  17066. {
  17067. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17068. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
  17069. x10.data(), x11.data(), x12.data(), 0, 0, 0);
  17070. }
  17071. template <typename Return, typename T>
  17072. Vc_INTRINSIC Vc_CONST Return
  17073. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  17074. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  17075. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  17076. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  17077. Scalar::Vector<T> x12, enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  17078. {
  17079. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17080. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
  17081. x10.data(), x11.data(), x12.data(), 0, 0, 0);
  17082. }
  17083. #endif
  17084. #ifdef Vc_IMPL_AVX2
  17085. template <typename Return, typename T>
  17086. Vc_INTRINSIC Vc_CONST Return
  17087. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  17088. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  17089. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  17090. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  17091. Scalar::Vector<T> x12, Scalar::Vector<T> x13,
  17092. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  17093. {
  17094. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17095. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
  17096. x10.data(), x11.data(), x12.data(), x13.data(), 0, 0);
  17097. }
  17098. template <typename Return, typename T>
  17099. Vc_INTRINSIC Vc_CONST Return
  17100. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  17101. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  17102. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  17103. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  17104. Scalar::Vector<T> x12, Scalar::Vector<T> x13,
  17105. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  17106. {
  17107. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17108. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
  17109. x10.data(), x11.data(), x12.data(), x13.data(), 0, 0);
  17110. }
  17111. #endif
  17112. #ifdef Vc_IMPL_AVX2
  17113. template <typename Return, typename T>
  17114. Vc_INTRINSIC Vc_CONST Return
  17115. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  17116. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  17117. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  17118. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  17119. Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
  17120. enable_if<std::is_same<Return, AVX2::short_v>::value>)
  17121. {
  17122. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17123. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
  17124. x10.data(), x11.data(), x12.data(), x13.data(), x14.data(),
  17125. 0);
  17126. }
  17127. template <typename Return, typename T>
  17128. Vc_INTRINSIC Vc_CONST Return
  17129. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  17130. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  17131. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  17132. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  17133. Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
  17134. enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  17135. {
  17136. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17137. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
  17138. x10.data(), x11.data(), x12.data(), x13.data(), x14.data(),
  17139. 0);
  17140. }
  17141. #endif
  17142. #ifdef Vc_IMPL_AVX2
  17143. template <typename Return, typename T>
  17144. Vc_INTRINSIC Vc_CONST Return
  17145. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  17146. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  17147. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  17148. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  17149. Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
  17150. Scalar::Vector<T> x15, enable_if<std::is_same<Return, AVX2::short_v>::value>)
  17151. {
  17152. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17153. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
  17154. x10.data(), x11.data(), x12.data(), x13.data(), x14.data(),
  17155. x15.data());
  17156. }
  17157. template <typename Return, typename T>
  17158. Vc_INTRINSIC Vc_CONST Return
  17159. simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
  17160. Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
  17161. Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
  17162. Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
  17163. Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
  17164. Scalar::Vector<T> x15, enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
  17165. {
  17166. return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
  17167. x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
  17168. x10.data(), x11.data(), x12.data(), x13.data(), x14.data(),
  17169. x15.data());
  17170. }
  17171. #endif
  17172. template <typename To, typename FromT>
  17173. Vc_INTRINSIC Vc_CONST To
  17174. simd_cast(AVX2::Vector<FromT> x, enable_if<Scalar::is_vector<To>::value>)
  17175. {
  17176. return static_cast<To>(x[0]);
  17177. }
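// Everything below deals with mask conversions. A mask stores an all-ones /
// all-zeros pattern per lane, so casting between mask types is a matter of
// repacking those patterns to the destination's element width (pack/unpack
// shuffles), not of converting values; lanes without a counterpart in the
// destination are discarded, and extra destination lanes come out unset.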
  17178. template <typename Return, typename T>
  17179. Vc_INTRINSIC Vc_CONST Return
  17180. simd_cast(const AVX2::Mask<T> &k, enable_if<AVX2::is_mask<Return>::value>)
  17181. {
  17182. return {Detail::mask_cast<Mask<T, VectorAbi::Avx>::Size, Return::Size,
  17183. typename Return::VectorTypeF>(k.dataI())};
  17184. }
  17185. Vc_SIMD_CAST_AVX_2(double_m, float_m) { return AVX::concat(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); }
  17186. #ifdef Vc_IMPL_AVX2
  17187. Vc_SIMD_CAST_AVX_2(double_m, int_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi32(x0.dataI(), x1.dataI())); }
  17188. Vc_SIMD_CAST_AVX_2(double_m, uint_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi32(x0.dataI(), x1.dataI())); }
  17189. Vc_SIMD_CAST_AVX_2(double_m, short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI())))); }
  17190. Vc_SIMD_CAST_AVX_2(double_m, ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI())))); }
  17191. Vc_SIMD_CAST_AVX_2( float_m, short_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
  17192. Vc_SIMD_CAST_AVX_2( float_m, ushort_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
  17193. Vc_SIMD_CAST_AVX_2( int_m, short_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
  17194. Vc_SIMD_CAST_AVX_2( int_m, ushort_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
  17195. Vc_SIMD_CAST_AVX_2( uint_m, short_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
  17196. Vc_SIMD_CAST_AVX_2( uint_m, ushort_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
  17197. #endif
  17198. #ifdef Vc_IMPL_AVX2
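// Combining four double_m into one short_m: two rounds of packs_epi32 leave the
// entries interleaved per 128-bit lane, so the final unpacklo/unpackhi_epi32 on
// the two halves of tmp puts the 16 mask entries back into source order.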
  17199. Vc_SIMD_CAST_AVX_4(double_m, short_m)
  17200. {
  17201. using namespace AVX;
17202. const auto tmp = _mm256_packs_epi32(
17203. _mm256_packs_epi32(x0.dataI(), x1.dataI()),
17204. _mm256_packs_epi32(x2.dataI(), x3.dataI()));
  17207. return concat(_mm_unpacklo_epi32(lo128(tmp), hi128(tmp)),
  17208. _mm_unpackhi_epi32(lo128(tmp), hi128(tmp)));
  17209. }
  17210. Vc_SIMD_CAST_AVX_4(double_m, ushort_m) { return simd_cast<AVX2::short_m>(x0, x1, x2, x3).data(); }
  17211. #endif
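// SSE mask -> AVX2 mask widening follows. When the destination has more lanes,
// the SSE mask is zero-extended and the extra lanes read as false; when the
// lane count matches but the lanes are wider, each entry is duplicated with
// unpack shuffles so every destination lane stays fully set or fully clear.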
  17212. Vc_SIMD_CAST_1(SSE::double_m, AVX2::double_m) { return AVX::zeroExtend(x.data()); }
  17213. Vc_SIMD_CAST_1(SSE::double_m, AVX2:: float_m) { return AVX::zeroExtend(simd_cast<SSE:: float_m>(x).data()); }
  17214. #ifdef Vc_IMPL_AVX2
  17215. Vc_SIMD_CAST_1(SSE::double_m, AVX2:: int_m) { return AVX::zeroExtend(simd_cast<SSE:: int_m>(x).data()); }
  17216. Vc_SIMD_CAST_1(SSE::double_m, AVX2:: uint_m) { return AVX::zeroExtend(simd_cast<SSE:: uint_m>(x).data()); }
  17217. Vc_SIMD_CAST_1(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
  17218. Vc_SIMD_CAST_1(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
  17219. #endif
  17220. Vc_SIMD_CAST_1(SSE:: float_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.dataF(), x.dataF()), _mm_unpackhi_ps(x.dataF(), x.dataF())); }
  17221. Vc_SIMD_CAST_1(SSE:: int_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.dataF(), x.dataF()), _mm_unpackhi_ps(x.dataF(), x.dataF())); }
  17222. Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.dataF(), x.dataF()), _mm_unpackhi_ps(x.dataF(), x.dataF())); }
  17223. Vc_SIMD_CAST_1(SSE:: short_m, AVX2::double_m) { auto tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); }
  17224. Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::double_m) { auto tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); }
  17225. Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: float_m) { return AVX::zeroExtend(x.dataF()); }
  17226. Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: float_m) { return AVX::zeroExtend(x.dataF()); }
  17227. Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: float_m) { return AVX::zeroExtend(x.dataF()); }
  17228. Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: float_m) { return AVX::concat(_mm_unpacklo_epi16(x.dataI(), x.dataI()), _mm_unpackhi_epi16(x.dataI(), x.dataI())); }
  17229. Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: float_m) { return AVX::concat(_mm_unpacklo_epi16(x.dataI(), x.dataI()), _mm_unpackhi_epi16(x.dataI(), x.dataI())); }
  17230. #ifdef Vc_IMPL_AVX2
  17231. Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); }
  17232. Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); }
  17233. Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); }
  17234. Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); }
  17235. Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); }
  17236. Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); }
  17237. Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
  17238. Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
  17239. Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
  17240. Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
  17241. Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
  17242. Vc_SIMD_CAST_1(SSE:: float_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
  17243. Vc_SIMD_CAST_1(SSE:: int_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
  17244. Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
  17245. Vc_SIMD_CAST_1(SSE:: short_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
  17246. Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
  17247. Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: int_m) { const auto v = Mem::permute4x64<X0, X2, X1, X3>(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); }
  17248. Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: uint_m) { const auto v = Mem::permute4x64<X0, X2, X1, X3>(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); }
  17249. Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: int_m) { const auto v = Mem::permute4x64<X0, X2, X1, X3>(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); }
  17250. Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: uint_m) { const auto v = Mem::permute4x64<X0, X2, X1, X3>(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); }
  17251. #endif
  17252. Vc_SIMD_CAST_2(SSE::double_m, AVX2::double_m) { return AVX::concat(x0.data(), x1.data()); }
  17253. Vc_SIMD_CAST_2(SSE::double_m, AVX2:: float_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); }
  17254. Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); }
  17255. Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); }
  17256. Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); }
  17257. #ifdef Vc_IMPL_AVX2
  17258. Vc_SIMD_CAST_2(SSE::double_m, AVX2:: int_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); }
  17259. Vc_SIMD_CAST_2(SSE::double_m, AVX2:: uint_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); }
  17260. Vc_SIMD_CAST_2(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_setzero_si128())); }
  17261. Vc_SIMD_CAST_2(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_setzero_si128())); }
  17262. Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); }
  17263. Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); }
  17264. Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
  17265. Vc_SIMD_CAST_2(SSE:: float_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
  17266. Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); }
  17267. Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); }
  17268. Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
  17269. Vc_SIMD_CAST_2(SSE:: int_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
  17270. Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); }
  17271. Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); }
  17272. Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
  17273. Vc_SIMD_CAST_2(SSE:: uint_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
  17274. Vc_SIMD_CAST_2(SSE:: short_m, AVX2:: short_m) { return AVX::concat(x0.data(), x1.data()); }
  17275. Vc_SIMD_CAST_2(SSE:: short_m, AVX2::ushort_m) { return AVX::concat(x0.data(), x1.data()); }
  17276. Vc_SIMD_CAST_2(SSE::ushort_m, AVX2:: short_m) { return AVX::concat(x0.data(), x1.data()); }
  17277. Vc_SIMD_CAST_2(SSE::ushort_m, AVX2::ushort_m) { return AVX::concat(x0.data(), x1.data()); }
  17278. #endif
  17279. Vc_SIMD_CAST_4(SSE::double_m, AVX2:: float_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); }
  17280. #ifdef Vc_IMPL_AVX2
  17281. Vc_SIMD_CAST_4(SSE::double_m, AVX2:: int_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); }
  17282. Vc_SIMD_CAST_4(SSE::double_m, AVX2:: uint_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); }
  17283. Vc_SIMD_CAST_4(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI()))); }
  17284. Vc_SIMD_CAST_4(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI()))); }
  17285. Vc_SIMD_CAST_4(SSE:: float_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
  17286. Vc_SIMD_CAST_4(SSE:: float_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
  17287. Vc_SIMD_CAST_4(SSE:: int_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
  17288. Vc_SIMD_CAST_4(SSE:: int_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
  17289. Vc_SIMD_CAST_4(SSE:: uint_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
  17290. Vc_SIMD_CAST_4(SSE:: uint_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
  17291. #endif
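// Summary of the SSE -> AVX2 mask casts above: a single 128-bit source mask is zero-extended
// into the low half of the 256-bit result, two sources are concatenated, and four sources are
// first packed down to narrower elements. Illustrative sketch (not part of the library source):
//   SSE::float_m lo = ..., hi = ...;
//   AVX2::float_m m = simd_cast<AVX2::float_m>(lo, hi);   // concat(lo, hi)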
  17292. template <typename Return, typename T>
  17293. Vc_INTRINSIC Vc_CONST Return
  17294. simd_cast(Scalar::Mask<T> k, enable_if<AVX2::is_mask<Return>::value>)
  17295. {
  17296. Return r{false};
  17297. r[0] = k.data();
  17298. return r;
  17299. }
  17300. template <typename Return, typename T>
  17301. Vc_INTRINSIC Vc_CONST Return
  17302. simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1,
  17303. enable_if<AVX2::is_mask<Return>::value>)
  17304. {
  17305. Return r{false};
  17306. r[0] = k0.data();
  17307. r[1] = k1.data();
  17308. return r;
  17309. }
  17310. template <typename Return, typename T>
  17311. Vc_INTRINSIC Vc_CONST Return
  17312. simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
  17313. enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 4)>)
  17314. {
  17315. Return r{false};
  17316. r[0] = k0.data();
  17317. r[1] = k1.data();
  17318. r[2] = k2.data();
  17319. r[3] = k3.data();
  17320. return r;
  17321. }
  17322. template <typename Return, typename T>
  17323. Vc_INTRINSIC Vc_CONST Return
  17324. simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
  17325. Scalar::Mask<T> k4, Scalar::Mask<T> k5, Scalar::Mask<T> k6, Scalar::Mask<T> k7,
  17326. enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 8)>)
  17327. {
  17328. Return r{false};
  17329. r[0] = k0.data();
  17330. r[1] = k1.data();
  17331. r[2] = k2.data();
  17332. r[3] = k3.data();
  17333. r[4] = k4.data();
  17334. r[5] = k5.data();
  17335. r[6] = k6.data();
  17336. r[7] = k7.data();
  17337. return r;
  17338. }
  17339. template <typename Return, typename T>
  17340. Vc_INTRINSIC Vc_CONST Return
  17341. simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
  17342. Scalar::Mask<T> k4, Scalar::Mask<T> k5, Scalar::Mask<T> k6, Scalar::Mask<T> k7,
  17343. Scalar::Mask<T> k8, Scalar::Mask<T> k9, Scalar::Mask<T> k10,
  17344. Scalar::Mask<T> k11, Scalar::Mask<T> k12, Scalar::Mask<T> k13,
  17345. Scalar::Mask<T> k14, Scalar::Mask<T> k15,
  17346. enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 16)>)
  17347. {
  17348. Return r{false};
  17349. r[0] = k0.data();
  17350. r[1] = k1.data();
  17351. r[2] = k2.data();
  17352. r[3] = k3.data();
  17353. r[4] = k4.data();
  17354. r[5] = k5.data();
  17355. r[6] = k6.data();
  17356. r[7] = k7.data();
  17357. r[8] = k8.data();
  17358. r[9] = k9.data();
  17359. r[10] = k10.data();
  17360. r[11] = k11.data();
  17361. r[12] = k12.data();
  17362. r[13] = k13.data();
  17363. r[14] = k14.data();
  17364. r[15] = k15.data();
  17365. return r;
  17366. }
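// The Scalar::Mask -> AVX2 mask overloads above have no SIMD fast path: they start from an
// all-false mask and copy each scalar bool into its lane. Illustrative sketch:
//   Scalar::Mask<float> k0 = ..., k1 = ...;
//   AVX2::float_m m = simd_cast<AVX2::float_m>(k0, k1);   // lanes 0 and 1 set, rest false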
  17367. Vc_SIMD_CAST_1(AVX2::double_m, SSE::double_m) { return AVX::lo128(x.data()); }
  17368. Vc_SIMD_CAST_1(AVX2::double_m, SSE:: float_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
  17369. Vc_SIMD_CAST_1(AVX2::double_m, SSE:: int_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
  17370. Vc_SIMD_CAST_1(AVX2::double_m, SSE:: uint_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
  17371. Vc_SIMD_CAST_1(AVX2::double_m, SSE:: short_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())), _mm_setzero_si128()); }
  17372. Vc_SIMD_CAST_1(AVX2::double_m, SSE::ushort_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())), _mm_setzero_si128()); }
  17373. Vc_SIMD_CAST_1(AVX2:: float_m, SSE::double_m) { return _mm_unpacklo_ps(AVX::lo128(x.data()), AVX::lo128(x.data())); }
  17374. Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: float_m) { return AVX::lo128(x.data()); }
  17375. Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: int_m) { return AVX::lo128(x.data()); }
  17376. Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: uint_m) { return AVX::lo128(x.data()); }
  17377. Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
  17378. Vc_SIMD_CAST_1(AVX2:: float_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
  17379. #ifdef Vc_IMPL_AVX2
  17380. Vc_SIMD_CAST_1(AVX2:: int_m, SSE::double_m) { return _mm_unpacklo_epi32(AVX::lo128(x.dataI()), AVX::lo128(x.dataI())); }
  17381. Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: float_m) { return AVX::lo128(x.dataI()); }
  17382. Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: int_m) { return AVX::lo128(x.dataI()); }
  17383. Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: uint_m) { return AVX::lo128(x.dataI()); }
  17384. Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
  17385. Vc_SIMD_CAST_1(AVX2:: int_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
  17386. Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::double_m) { return _mm_unpacklo_epi32(AVX::lo128(x.dataI()), AVX::lo128(x.dataI())); }
  17387. Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: float_m) { return AVX::lo128(x.dataI()); }
  17388. Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: int_m) { return AVX::lo128(x.dataI()); }
  17389. Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: uint_m) { return AVX::lo128(x.dataI()); }
  17390. Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
  17391. Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
  17392. Vc_SIMD_CAST_1(AVX2:: short_m, SSE::double_m) { return simd_cast<SSE::double_m>(SSE::short_m(AVX::lo128(x.data()))); }
  17393. Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: float_m) { return simd_cast<SSE:: float_m>(SSE::short_m(AVX::lo128(x.data()))); }
  17394. Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: int_m) { return simd_cast<SSE:: int_m>(SSE::short_m(AVX::lo128(x.data()))); }
  17395. Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: uint_m) { return simd_cast<SSE:: uint_m>(SSE::short_m(AVX::lo128(x.data()))); }
  17396. Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: short_m) { return simd_cast<SSE:: short_m>(SSE::short_m(AVX::lo128(x.data()))); }
  17397. Vc_SIMD_CAST_1(AVX2:: short_m, SSE::ushort_m) { return simd_cast<SSE::ushort_m>(SSE::short_m(AVX::lo128(x.data()))); }
  17398. Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::double_m) { return simd_cast<SSE::double_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
  17399. Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: float_m) { return simd_cast<SSE:: float_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
  17400. Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: int_m) { return simd_cast<SSE:: int_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
  17401. Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: uint_m) { return simd_cast<SSE:: uint_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
  17402. Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: short_m) { return simd_cast<SSE:: short_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
  17403. Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::ushort_m) { return simd_cast<SSE::ushort_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
  17404. #endif
  17405. Vc_SIMD_CAST_2(AVX2::double_m, SSE:: short_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); }
  17406. Vc_SIMD_CAST_2(AVX2::double_m, SSE::ushort_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); }
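// The AVX2 -> SSE mask casts above either take the low 128-bit half directly (same element
// width) or pack the two halves of the 256-bit mask into narrower elements.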
  17407. template <typename To, typename FromT>
  17408. Vc_INTRINSIC Vc_CONST To
  17409. simd_cast(AVX2::Mask<FromT> x, enable_if<Scalar::is_mask<To>::value>)
  17410. {
  17411. return static_cast<To>(x[0]);
  17412. }
  17413. template <typename Return, int offset, typename From>
  17414. Vc_INTRINSIC Vc_CONST enable_if<
  17415. (offset == 0 &&
  17416. ((AVX2::is_vector<From>::value && !Scalar::is_vector<Return>::value &&
  17417. Traits::is_simd_vector<Return>::value && !Traits::isSimdArray<Return>::value) ||
  17418. (AVX2::is_mask<From>::value && !Scalar::is_mask<Return>::value &&
  17419. Traits::is_simd_mask<Return>::value &&
  17420. !Traits::isSimdMaskArray<Return>::value))),
  17421. Return>
  17422. simd_cast(const From &x)
  17423. {
  17424. return simd_cast<Return>(x);
  17425. }
  17426. template <typename Return, int offset, typename From>
  17427. Vc_INTRINSIC Vc_CONST Return
  17428. simd_cast(const From &x,
  17429. enable_if<offset == 0 && ((SSE::is_vector<From>::value &&
  17430. AVX2::is_vector<Return>::value) ||
  17431. (SSE::is_mask<From>::value &&
  17432. AVX2::is_mask<Return>::value))>)
  17433. {
  17434. return simd_cast<Return>(x);
  17435. }
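// The two offset == 0 overloads above simply forward to the corresponding plain simd_cast
// overloads, e.g. simd_cast<AVX2::float_v, 0>(x) is the same as simd_cast<AVX2::float_v>(x).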
  17436. template <typename Return, int offset, typename T>
  17437. Vc_INTRINSIC Vc_CONST enable_if<(AVX2::is_vector<Return>::value && offset != 0),
  17438. Return>
  17439. simd_cast(AVX2::Vector<T> x)
  17440. {
  17441. using V = AVX2::Vector<T>;
  17442. constexpr int shift = sizeof(T) * offset * Return::Size;
  17443. static_assert(shift > 0 && shift < sizeof(x), "");
  17444. if (shift < 16) {
  17445. return simd_cast<Return>(V{AVX::avx_cast<typename V::VectorType>(
  17446. _mm_srli_si128(AVX::avx_cast<__m128i>(AVX::lo128(x.data())), shift))});
  17447. } else if (shift == 16) {
  17448. return simd_cast<Return>(V{Mem::permute128<X1, Const0>(x.data())});
  17449. } else {
  17450. #ifdef Vc_MSVC
  17451. #pragma warning(push)
  17452. #pragma warning(disable : 4556)
  17453. #endif
  17454. return simd_cast<Return>(V{AVX::avx_cast<typename V::VectorType>(
  17455. _mm_srli_si128(AVX::avx_cast<__m128i>(AVX::hi128(x.data())), shift - 16))});
  17456. #ifdef Vc_MSVC
  17457. #pragma warning(pop)
  17458. #endif
  17459. }
  17460. }
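// In the overload above, 'shift' is the byte offset of the requested sub-vector: shifts below
// 16 bytes stay within the low 128-bit half, exactly 16 selects the high half, and larger
// shifts index into the high half. Illustrative sketch (hypothetical values, assumes an
// 8-element AVX2::int_v):
//   AVX2::int_v v = ...;
//   auto d = simd_cast<AVX2::double_v, 1>(v);   // converts elements 4..7 of v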
  17461. template <typename Return, int offset, typename T>
  17462. Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector<Return>::value &&
  17463. sizeof(AVX2::Vector<T>) == 32),
  17464. Return>
  17465. simd_cast(AVX2::Vector<T> x)
  17466. {
  17467. using V = AVX2::Vector<T>;
  17468. constexpr int shift = sizeof(V) / V::Size * offset * Return::Size;
  17469. static_assert(shift > 0, "");
  17470. static_assert(shift < sizeof(V), "");
  17471. using SseVector = SSE::Vector<typename V::EntryType>;
  17472. if (shift == 16) {
  17473. return simd_cast<Return>(SseVector{AVX::hi128(x.data())});
  17474. }
  17475. using Intrin = typename SseVector::VectorType;
  17476. return simd_cast<Return>(SseVector{AVX::avx_cast<Intrin>(
  17477. _mm_alignr_epi8(AVX::avx_cast<__m128i>(AVX::hi128(x.data())),
  17478. AVX::avx_cast<__m128i>(AVX::lo128(x.data())), shift))});
  17479. }
  17480. template <typename Return, int offset, typename T>
  17481. Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector<Return>::value &&
  17482. sizeof(AVX2::Vector<T>) == 16),
  17483. Return>
  17484. simd_cast(AVX2::Vector<T> x)
  17485. {
  17486. using V = AVX2::Vector<T>;
  17487. constexpr int shift = sizeof(V) / V::Size * offset * Return::Size;
  17488. static_assert(shift > 0, "");
  17489. static_assert(shift < sizeof(V), "");
  17490. using SseVector = SSE::Vector<typename V::EntryType>;
  17491. return simd_cast<Return>(SseVector{_mm_srli_si128(x.data(), shift)});
  17492. }
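// Of the two offset != 0 casts to SSE vectors above, the 32-byte case uses _mm_alignr_epi8 to
// extract the 128-bit window that starts 'shift' bytes into the concatenation of the low and
// high halves; the 16-byte case is a plain byte shift of the single 128-bit register.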
  17493. Vc_SIMD_CAST_OFFSET(SSE:: short_v, AVX2::double_v, 1) { return simd_cast<AVX2::double_v>(simd_cast<SSE::int_v, 1>(x)); }
  17494. Vc_SIMD_CAST_OFFSET(SSE::ushort_v, AVX2::double_v, 1) { return simd_cast<AVX2::double_v>(simd_cast<SSE::int_v, 1>(x)); }
  17495. template <typename Return, int offset, typename T>
  17496. Vc_INTRINSIC Vc_CONST Return
  17497. simd_cast(const AVX2::Mask<T> &k,
  17498. enable_if<(AVX2::is_mask<Return>::value && offset == 1 &&
  17499. AVX2::Mask<T>::Size == Return::Size * 2)> = nullarg)
  17500. {
  17501. const auto tmp = AVX::hi128(k.dataI());
  17502. return AVX::concat(_mm_unpacklo_epi8(tmp, tmp), _mm_unpackhi_epi8(tmp, tmp));
  17503. }
  17504. template <typename Return, int offset, typename T>
  17505. Vc_INTRINSIC Vc_CONST Return
  17506. simd_cast(const AVX2::Mask<T> &k,
  17507. enable_if<(AVX2::is_mask<Return>::value && offset == 1 &&
  17508. AVX2::Mask<T>::Size == Return::Size * 4)> = nullarg)
  17509. {
  17510. auto tmp = AVX::lo128(k.dataI());
  17511. tmp = _mm_unpackhi_epi8(tmp, tmp);
  17512. return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp));
  17513. }
  17514. template <typename Return, int offset, typename T>
  17515. Vc_INTRINSIC Vc_CONST Return
  17516. simd_cast(const AVX2::Mask<T> &k,
  17517. enable_if<(AVX2::is_mask<Return>::value && offset == 2 &&
  17518. AVX2::Mask<T>::Size == Return::Size * 4)> = nullarg)
  17519. {
  17520. auto tmp = AVX::hi128(k.dataI());
  17521. tmp = _mm_unpacklo_epi8(tmp, tmp);
  17522. return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp));
  17523. }
  17524. template <typename Return, int offset, typename T>
  17525. Vc_INTRINSIC Vc_CONST Return
  17526. simd_cast(const AVX2::Mask<T> &k,
  17527. enable_if<(AVX2::is_mask<Return>::value && offset == 3 &&
  17528. AVX2::Mask<T>::Size == Return::Size * 4)> = nullarg)
  17529. {
  17530. auto tmp = AVX::hi128(k.dataI());
  17531. tmp = _mm_unpackhi_epi8(tmp, tmp);
  17532. return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp));
  17533. }
  17534. Vc_SIMD_CAST_OFFSET(SSE:: short_m, AVX2::double_m, 1) { auto tmp = _mm_unpackhi_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); }
  17535. Vc_SIMD_CAST_OFFSET(SSE::ushort_m, AVX2::double_m, 1) { auto tmp = _mm_unpackhi_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); }
  17536. template <typename Return, int offset, typename T>
  17537. Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask<Return>::value &&
  17538. sizeof(AVX2::Mask<T>) == 32),
  17539. Return>
  17540. simd_cast(AVX2::Mask<T> x)
  17541. {
  17542. using M = AVX2::Mask<T>;
  17543. constexpr int shift = sizeof(M) / M::Size * offset * Return::Size;
  17544. static_assert(shift > 0, "");
  17545. static_assert(shift < sizeof(M), "");
  17546. using SseVector = SSE::Mask<Traits::entry_type_of<typename M::Vector>>;
  17547. if (shift == 16) {
  17548. return simd_cast<Return>(SseVector{AVX::hi128(x.data())});
  17549. }
  17550. using Intrin = typename SseVector::VectorType;
  17551. return simd_cast<Return>(SseVector{AVX::avx_cast<Intrin>(
  17552. _mm_alignr_epi8(AVX::hi128(x.dataI()), AVX::lo128(x.dataI()), shift))});
  17553. }
  17554. template <typename Return, int offset, typename T>
  17555. Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask<Return>::value &&
  17556. sizeof(AVX2::Mask<T>) == 16),
  17557. Return>
  17558. simd_cast(AVX2::Mask<T> x)
  17559. {
  17560. return simd_cast<Return, offset>(simd_cast<SSE::Mask<T>>(x));
  17561. }
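// The offset != 0 mask extractions above mirror the vector versions: the offset 1..3 overloads
// widen the selected part of a narrower mask by duplicating bytes/words, while the generic
// overloads cut a 128-bit window out of the 256-bit mask (again via _mm_alignr_epi8) or defer
// to the equivalent SSE mask cast.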
  17562. #undef Vc_SIMD_CAST_AVX_1
  17563. #undef Vc_SIMD_CAST_AVX_2
  17564. #undef Vc_SIMD_CAST_AVX_3
  17565. #undef Vc_SIMD_CAST_AVX_4
  17566. #undef Vc_SIMD_CAST_1
  17567. #undef Vc_SIMD_CAST_2
  17568. #undef Vc_SIMD_CAST_3
  17569. #undef Vc_SIMD_CAST_4
  17570. #undef Vc_SIMD_CAST_5
  17571. #undef Vc_SIMD_CAST_6
  17572. #undef Vc_SIMD_CAST_7
  17573. #undef Vc_SIMD_CAST_8
  17574. #undef Vc_SIMD_CAST_OFFSET
  17575. }
  17576. #endif
  17577. #endif
  17578. #endif
  17579. namespace Vc_VERSIONED_NAMESPACE
  17580. {
  17581. using double_v = Vector<double>;
  17582. using float_v = Vector<float>;
  17583. using int_v = Vector<int>;
  17584. using uint_v = Vector<uint>;
  17585. using short_v = Vector<short>;
  17586. using ushort_v = Vector<ushort>;
  17587. using llong_v = Vector<llong>;
  17588. using ullong_v = Vector<ullong>;
  17589. using long_v = Vector<long>;
  17590. using ulong_v = Vector<ulong>;
  17591. using schar_v = Vector<schar>;
  17592. using uchar_v = Vector<uchar>;
  17593. using double_m = Mask<double>;
  17594. using float_m = Mask< float>;
  17595. using llong_m = Mask< llong>;
  17596. using ullong_m = Mask<ullong>;
  17597. using long_m = Mask< long>;
  17598. using ulong_m = Mask< ulong>;
  17599. using int_m = Mask< int>;
  17600. using uint_m = Mask< uint>;
  17601. using short_m = Mask< short>;
  17602. using ushort_m = Mask<ushort>;
  17603. using schar_m = Mask< schar>;
  17604. using uchar_m = Mask< uchar>;
  17605. typedef Vector<std:: int_least64_t> int_least64_v;
  17606. typedef Vector<std::uint_least64_t> uint_least64_v;
  17607. typedef Vector<std:: int_least32_t> int_least32_v;
  17608. typedef Vector<std::uint_least32_t> uint_least32_v;
  17609. typedef Vector<std:: int_least16_t> int_least16_v;
  17610. typedef Vector<std::uint_least16_t> uint_least16_v;
  17611. typedef Vector<std:: int_least8_t> int_least8_v;
  17612. typedef Vector<std:: uint_least8_t> uint_least8_v;
  17613. typedef Mask<std:: int_least64_t> int_least64_m;
  17614. typedef Mask<std::uint_least64_t> uint_least64_m;
  17615. typedef Mask<std:: int_least32_t> int_least32_m;
  17616. typedef Mask<std::uint_least32_t> uint_least32_m;
  17617. typedef Mask<std:: int_least16_t> int_least16_m;
  17618. typedef Mask<std::uint_least16_t> uint_least16_m;
  17619. typedef Mask<std:: int_least8_t> int_least8_m;
  17620. typedef Mask<std:: uint_least8_t> uint_least8_m;
  17621. typedef Vector<std:: int_fast64_t> int_fast64_v;
  17622. typedef Vector<std::uint_fast64_t> uint_fast64_v;
  17623. typedef Vector<std:: int_fast32_t> int_fast32_v;
  17624. typedef Vector<std::uint_fast32_t> uint_fast32_v;
  17625. typedef Vector<std:: int_fast16_t> int_fast16_v;
  17626. typedef Vector<std::uint_fast16_t> uint_fast16_v;
  17627. typedef Vector<std:: int_fast8_t> int_fast8_v;
  17628. typedef Vector<std:: uint_fast8_t> uint_fast8_v;
  17629. typedef Mask<std:: int_fast64_t> int_fast64_m;
  17630. typedef Mask<std::uint_fast64_t> uint_fast64_m;
  17631. typedef Mask<std:: int_fast32_t> int_fast32_m;
  17632. typedef Mask<std::uint_fast32_t> uint_fast32_m;
  17633. typedef Mask<std:: int_fast16_t> int_fast16_m;
  17634. typedef Mask<std::uint_fast16_t> uint_fast16_m;
  17635. typedef Mask<std:: int_fast8_t> int_fast8_m;
  17636. typedef Mask<std:: uint_fast8_t> uint_fast8_m;
  17637. #if defined INT64_MAX && defined UINT64_MAX
  17638. typedef Vector<std:: int64_t> int64_v;
  17639. typedef Vector<std::uint64_t> uint64_v;
  17640. typedef Mask<std:: int64_t> int64_m;
  17641. typedef Mask<std::uint64_t> uint64_m;
  17642. #endif
  17643. #if defined INT32_MAX && defined UINT32_MAX
  17644. typedef Vector<std:: int32_t> int32_v;
  17645. typedef Vector<std::uint32_t> uint32_v;
  17646. typedef Mask<std:: int32_t> int32_m;
  17647. typedef Mask<std::uint32_t> uint32_m;
  17648. #endif
  17649. #if defined INT16_MAX && defined UINT16_MAX
  17650. typedef Vector<std:: int16_t> int16_v;
  17651. typedef Vector<std::uint16_t> uint16_v;
  17652. typedef Mask<std:: int16_t> int16_m;
  17653. typedef Mask<std::uint16_t> uint16_m;
  17654. #endif
  17655. #if defined INT8_MAX && defined UINT8_MAX
  17656. typedef Vector<std:: int8_t> int8_v;
  17657. typedef Vector<std::uint8_t> uint8_v;
  17658. typedef Mask<std:: int8_t> int8_m;
  17659. typedef Mask<std::uint8_t> uint8_m;
  17660. #endif
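// Illustrative usage of the aliases above (assumes the default ABI is in effect):
//   Vc::float_v x = Vc::float_v::Random();
//   Vc::float_m positive = x > 0.f;
//   Vc::int32_v i(0);   // only declared when INT32_MAX and UINT32_MAX are defined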
  17661. namespace {
  17662. static_assert(double_v::Size == Vc_DOUBLE_V_SIZE, "Vc_DOUBLE_V_SIZE macro defined to an incorrect value");
  17663. static_assert(float_v::Size == Vc_FLOAT_V_SIZE , "Vc_FLOAT_V_SIZE macro defined to an incorrect value ");
  17664. static_assert(int_v::Size == Vc_INT_V_SIZE , "Vc_INT_V_SIZE macro defined to an incorrect value ");
  17665. static_assert(uint_v::Size == Vc_UINT_V_SIZE , "Vc_UINT_V_SIZE macro defined to an incorrect value ");
  17666. static_assert(short_v::Size == Vc_SHORT_V_SIZE , "Vc_SHORT_V_SIZE macro defined to an incorrect value ");
  17667. static_assert(ushort_v::Size == Vc_USHORT_V_SIZE, "Vc_USHORT_V_SIZE macro defined to an incorrect value");
  17668. }
  17669. }
  17670. #ifndef COMMON_OPERATORS_H_
  17671. #define COMMON_OPERATORS_H_
  17672. #ifndef VC_COMMON_SIMDARRAY_H_
  17673. #define VC_COMMON_SIMDARRAY_H_
  17674. #include <array>
  17675. #ifndef VC_COMMON_SIMDARRAYHELPER_H_
  17676. #define VC_COMMON_SIMDARRAYHELPER_H_
  17677. namespace Vc_VERSIONED_NAMESPACE
  17678. {
  17679. namespace
  17680. {
  17681. static constexpr struct private_init_t {} private_init = {};
  17682. }
  17683. namespace Common
  17684. {
  17685. namespace Operations
  17686. {
  17687. struct tag {};
  17688. #define Vc_DEFINE_OPERATION(name_) \
  17689. struct name_ : public tag { \
  17690. template <typename V, typename... Args> \
  17691. Vc_INTRINSIC void operator()(V &v, Args &&... args) \
  17692. { \
  17693. v.name_(std::forward<Args>(args)...); \
  17694. } \
  17695. }
  17696. Vc_DEFINE_OPERATION(gather);
  17697. Vc_DEFINE_OPERATION(scatter);
  17698. Vc_DEFINE_OPERATION(load);
  17699. Vc_DEFINE_OPERATION(store);
  17700. Vc_DEFINE_OPERATION(setZero);
  17701. Vc_DEFINE_OPERATION(setZeroInverted);
  17702. Vc_DEFINE_OPERATION(assign);
  17703. #undef Vc_DEFINE_OPERATION
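// Each struct generated above is a tag-dispatched functor that forwards to the member function
// of the same name on the target vector. Illustrative sketch (not library code):
//   Common::Operations::load op;
//   op(v, mem, Vc::Aligned);   // equivalent to v.load(mem, Vc::Aligned)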
  17704. #define Vc_DEFINE_OPERATION(name_,code_) \
  17705. struct name_ : public tag { \
  17706. template <typename V> Vc_INTRINSIC void operator()(V &v) { code_; } \
  17707. }
  17708. Vc_DEFINE_OPERATION(increment, ++(v));
  17709. Vc_DEFINE_OPERATION(decrement, --(v));
  17710. Vc_DEFINE_OPERATION(random, v = V::Random());
  17711. #undef Vc_DEFINE_OPERATION
  17712. #define Vc_DEFINE_OPERATION_FORWARD(name_) \
  17713. struct Forward_##name_ : public tag \
  17714. { \
  17715. template <typename... Args, typename = decltype(name_(std::declval<Args>()...))> \
  17716. Vc_INTRINSIC void operator()(decltype(name_(std::declval<Args>()...)) &v, \
  17717. Args &&... args) \
  17718. { \
  17719. v = name_(std::forward<Args>(args)...); \
  17720. } \
  17721. template <typename... Args, typename = decltype(name_(std::declval<Args>()...))> \
  17722. Vc_INTRINSIC void operator()(std::nullptr_t, Args && ... args) \
  17723. { \
  17724. name_(std::forward<Args>(args)...); \
  17725. } \
  17726. }
  17727. Vc_DEFINE_OPERATION_FORWARD(abs);
  17728. Vc_DEFINE_OPERATION_FORWARD(asin);
  17729. Vc_DEFINE_OPERATION_FORWARD(atan);
  17730. Vc_DEFINE_OPERATION_FORWARD(atan2);
  17731. Vc_DEFINE_OPERATION_FORWARD(cos);
  17732. Vc_DEFINE_OPERATION_FORWARD(ceil);
  17733. Vc_DEFINE_OPERATION_FORWARD(copysign);
  17734. Vc_DEFINE_OPERATION_FORWARD(exp);
  17735. Vc_DEFINE_OPERATION_FORWARD(exponent);
  17736. Vc_DEFINE_OPERATION_FORWARD(fma);
  17737. Vc_DEFINE_OPERATION_FORWARD(floor);
  17738. Vc_DEFINE_OPERATION_FORWARD(frexp);
  17739. Vc_DEFINE_OPERATION_FORWARD(isfinite);
  17740. Vc_DEFINE_OPERATION_FORWARD(isinf);
  17741. Vc_DEFINE_OPERATION_FORWARD(isnan);
  17742. Vc_DEFINE_OPERATION_FORWARD(isnegative);
  17743. Vc_DEFINE_OPERATION_FORWARD(ldexp);
  17744. Vc_DEFINE_OPERATION_FORWARD(log);
  17745. Vc_DEFINE_OPERATION_FORWARD(log10);
  17746. Vc_DEFINE_OPERATION_FORWARD(log2);
  17747. Vc_DEFINE_OPERATION_FORWARD(reciprocal);
  17748. Vc_DEFINE_OPERATION_FORWARD(round);
  17749. Vc_DEFINE_OPERATION_FORWARD(rsqrt);
  17750. Vc_DEFINE_OPERATION_FORWARD(sin);
  17751. Vc_DEFINE_OPERATION_FORWARD(sincos);
  17752. Vc_DEFINE_OPERATION_FORWARD(sqrt);
  17753. Vc_DEFINE_OPERATION_FORWARD(trunc);
  17754. Vc_DEFINE_OPERATION_FORWARD(min);
  17755. Vc_DEFINE_OPERATION_FORWARD(max);
  17756. #undef Vc_DEFINE_OPERATION_FORWARD
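// The Forward_* functors above wrap the like-named free functions (Vc::abs, Vc::sin, ...) so
// that SimdArray can apply them piece by piece; the std::nullptr_t overload is used when the
// result is written through the argument list rather than returned. Rough sketch:
//   Common::Operations::Forward_abs op;
//   AVX2::float_v r, x = ...;
//   op(r, x);   // r = abs(x)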
  17757. template<typename T> using is_operation = std::is_base_of<tag, T>;
  17758. }
  17759. template <typename T_, std::size_t Pieces_, std::size_t Index_> struct Segment
  17760. {
  17761. static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report.");
  17762. using type = T_;
  17763. using type_decayed = typename std::decay<type>::type;
  17764. static constexpr std::size_t Pieces = Pieces_;
  17765. static constexpr std::size_t Index = Index_;
  17766. using fixed_size_type =
  17767. fixed_size_simd<conditional_t<Traits::is_simd_vector<type_decayed>::value,
  17768. typename type_decayed::EntryType, float>,
  17769. type_decayed::Size / Pieces>;
  17770. type data;
  17771. static constexpr std::size_t EntryOffset = Index * type_decayed::Size / Pieces;
  17772. decltype(std::declval<const type &>()[0]) operator[](size_t i) const { return data[i + EntryOffset]; }
  17773. fixed_size_type to_fixed_size() const
  17774. {
  17775. return simd_cast<fixed_size_type, Index>(data);
  17776. }
  17777. };
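// Segment<T, Pieces, Index> is a lightweight view of the Index-th 1/Pieces slice of a vector
// or mask; operator[] applies EntryOffset and to_fixed_size() re-exports the slice as a
// fixed_size_simd. Illustrative sketch (assumes an 8-wide AVX2::float_v):
//   AVX2::float_v v = ...;
//   Common::Segment<AVX2::float_v, 2, 1> upper{v};
//   // upper[0] == v[4]; upper.to_fixed_size() holds elements 4..7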
  17778. template <typename T_, std::size_t Pieces_, std::size_t Index_>
  17779. struct Segment<T_ *, Pieces_, Index_> {
  17780. static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report.");
  17781. using type = T_ *;
  17782. using type_decayed = typename std::decay<T_>::type;
  17783. static constexpr size_t Pieces = Pieces_;
  17784. static constexpr size_t Index = Index_;
  17785. using fixed_size_type = fixed_size_simd<
  17786. typename std::conditional<Traits::is_simd_vector<type_decayed>::value,
  17787. typename type_decayed::VectorEntryType, float>::type,
  17788. type_decayed::Size / Pieces> *;
  17789. type data;
  17790. static constexpr std::size_t EntryOffset = Index * type_decayed::size() / Pieces;
  17791. fixed_size_type to_fixed_size() const
  17792. {
  17793. return reinterpret_cast<
  17794. #ifdef Vc_GCC
  17795. typename std::remove_pointer<fixed_size_type>::type
  17796. #else
  17797. MayAlias<typename std::remove_pointer<fixed_size_type>::type>
  17798. #endif
  17799. *>(data) +
  17800. Index;
  17801. }
  17802. };
  17803. template <typename T, std::size_t Offset> struct AddOffset
  17804. {
  17805. constexpr AddOffset() = default;
  17806. };
  17807. template <std::size_t secondOffset> class Split
  17808. {
  17809. template <typename U, std::size_t N, typename V, std::size_t M,
  17810. typename = enable_if<N != M>>
  17811. static Vc_INTRINSIC auto loImpl(const SimdArray<U, N, V, M> &x)
  17812. -> decltype(internal_data0(x))
  17813. {
  17814. return internal_data0(x);
  17815. }
  17816. template <typename U, std::size_t N, typename V, std::size_t M,
  17817. typename = enable_if<N != M>>
  17818. static Vc_INTRINSIC auto hiImpl(const SimdArray<U, N, V, M> &x)
  17819. -> decltype(internal_data1(x))
  17820. {
  17821. return internal_data1(x);
  17822. }
  17823. template <typename U, std::size_t N, typename V, std::size_t M,
  17824. typename = enable_if<N != M>>
  17825. static Vc_INTRINSIC auto loImpl(SimdArray<U, N, V, M> *x)
  17826. -> decltype(&internal_data0(*x))
  17827. {
  17828. return &internal_data0(*x);
  17829. }
  17830. template <typename U, std::size_t N, typename V, std::size_t M,
  17831. typename = enable_if<N != M>>
  17832. static Vc_INTRINSIC auto hiImpl(SimdArray<U, N, V, M> *x)
  17833. -> decltype(&internal_data1(*x))
  17834. {
  17835. return &internal_data1(*x);
  17836. }
  17837. template <typename U, std::size_t N, typename V>
  17838. static Vc_INTRINSIC Segment<V, 2, 0> loImpl(const SimdArray<U, N, V, N> &x)
  17839. {
  17840. return {internal_data(x)};
  17841. }
  17842. template <typename U, std::size_t N, typename V>
  17843. static Vc_INTRINSIC Segment<V, 2, 1> hiImpl(const SimdArray<U, N, V, N> &x)
  17844. {
  17845. return {internal_data(x)};
  17846. }
  17847. template <typename U, std::size_t N, typename V>
  17848. static Vc_INTRINSIC Segment<V *, 2, 0> loImpl(SimdArray<U, N, V, N> *x)
  17849. {
  17850. return {&internal_data(*x)};
  17851. }
  17852. template <typename U, std::size_t N, typename V>
  17853. static Vc_INTRINSIC Segment<V *, 2, 1> hiImpl(SimdArray<U, N, V, N> *x)
  17854. {
  17855. return {&internal_data(*x)};
  17856. }
  17857. template <typename U, std::size_t N, typename V, std::size_t M>
  17858. static Vc_INTRINSIC auto loImpl(const SimdMaskArray<U, N, V, M> &x) -> decltype(internal_data0(x))
  17859. {
  17860. return internal_data0(x);
  17861. }
  17862. template <typename U, std::size_t N, typename V, std::size_t M>
  17863. static Vc_INTRINSIC auto hiImpl(const SimdMaskArray<U, N, V, M> &x) -> decltype(internal_data1(x))
  17864. {
  17865. return internal_data1(x);
  17866. }
  17867. template <typename U, std::size_t N, typename V>
  17868. static Vc_INTRINSIC Segment<typename SimdMaskArray<U, N, V, N>::mask_type, 2, 0> loImpl(
  17869. const SimdMaskArray<U, N, V, N> &x)
  17870. {
  17871. return {internal_data(x)};
  17872. }
  17873. template <typename U, std::size_t N, typename V>
  17874. static Vc_INTRINSIC Segment<typename SimdMaskArray<U, N, V, N>::mask_type, 2, 1> hiImpl(
  17875. const SimdMaskArray<U, N, V, N> &x)
  17876. {
  17877. return {internal_data(x)};
  17878. }
  17879. #ifdef Vc_IMPL_AVX
  17880. template <class T>
  17881. static Vc_INTRINSIC SSE::Vector<T> loImpl(Vector<T, VectorAbi::Avx> &&x)
  17882. {
  17883. return simd_cast<SSE::Vector<T>, 0>(x);
  17884. }
  17885. template <class T>
  17886. static Vc_INTRINSIC SSE::Vector<T> hiImpl(Vector<T, VectorAbi::Avx> &&x)
  17887. {
  17888. return simd_cast<SSE::Vector<T>, 1>(x);
  17889. }
  17890. template <class T>
  17891. static Vc_INTRINSIC SSE::Mask<T> loImpl(Mask<T, VectorAbi::Avx> &&x)
  17892. {
  17893. return simd_cast<SSE::Mask<T>, 0>(x);
  17894. }
  17895. template <class T>
  17896. static Vc_INTRINSIC SSE::Mask<T> hiImpl(Mask<T, VectorAbi::Avx> &&x)
  17897. {
  17898. return simd_cast<SSE::Mask<T>, 1>(x);
  17899. }
  17900. #endif
  17901. template <typename T>
  17902. static constexpr bool is_vector_or_mask(){
  17903. return (Traits::is_simd_vector<T>::value && !Traits::isSimdArray<T>::value) ||
  17904. (Traits::is_simd_mask<T>::value && !Traits::isSimdMaskArray<T>::value);
  17905. }
  17906. template <typename V>
  17907. static Vc_INTRINSIC Segment<V, 2, 0> loImpl(V &&x, enable_if<is_vector_or_mask<V>()> = nullarg)
  17908. {
  17909. return {std::forward<V>(x)};
  17910. }
  17911. template <typename V>
  17912. static Vc_INTRINSIC Segment<V, 2, 1> hiImpl(V &&x, enable_if<is_vector_or_mask<V>()> = nullarg)
  17913. {
  17914. return {std::forward<V>(x)};
  17915. }
  17916. template <class T, class A>
  17917. static Vc_INTRINSIC const T *loImpl(const std::vector<T, A> &x)
  17918. {
  17919. return x.data();
  17920. }
  17921. template <class T, class A>
  17922. static Vc_INTRINSIC const T *hiImpl(const std::vector<T, A> &x)
  17923. {
  17924. return x.data() + secondOffset;
  17925. }
  17926. template <typename V, std::size_t Pieces, std::size_t Index>
  17927. static Vc_INTRINSIC Segment<V, 2 * Pieces, 2 * Index> loImpl(
  17928. const Segment<V, Pieces, Index> &x)
  17929. {
  17930. return {x.data};
  17931. }
  17932. template <typename V, std::size_t Pieces, std::size_t Index>
  17933. static Vc_INTRINSIC Segment<V, 2 * Pieces, 2 * Index + 1> hiImpl(
  17934. const Segment<V, Pieces, Index> &x)
  17935. {
  17936. return {x.data};
  17937. }
  17938. template <typename T, typename = decltype(loImpl(std::declval<T>()))>
  17939. static std::true_type have_lo_impl(int);
  17940. template <typename T> static std::false_type have_lo_impl(float);
  17941. template <typename T> static constexpr bool have_lo_impl()
  17942. {
  17943. return decltype(have_lo_impl<T>(1))::value;
  17944. }
  17945. template <typename T, typename = decltype(hiImpl(std::declval<T>()))>
  17946. static std::true_type have_hi_impl(int);
  17947. template <typename T> static std::false_type have_hi_impl(float);
  17948. template <typename T> static constexpr bool have_hi_impl()
  17949. {
  17950. return decltype(have_hi_impl<T>(1))::value;
  17951. }
  17952. public:
  17953. template <typename U>
  17954. static Vc_INTRINSIC const U *lo(Operations::gather, const U *ptr)
  17955. {
  17956. return ptr;
  17957. }
  17958. template <typename U>
  17959. static Vc_INTRINSIC const U *hi(Operations::gather, const U *ptr)
  17960. {
  17961. return ptr + secondOffset;
  17962. }
  17963. template <typename U, typename = enable_if<!std::is_pointer<U>::value>>
  17964. static Vc_ALWAYS_INLINE decltype(loImpl(std::declval<U>()))
  17965. lo(Operations::gather, U &&x)
  17966. {
  17967. return loImpl(std::forward<U>(x));
  17968. }
  17969. template <typename U, typename = enable_if<!std::is_pointer<U>::value>>
  17970. static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval<U>()))
  17971. hi(Operations::gather, U &&x)
  17972. {
  17973. return hiImpl(std::forward<U>(x));
  17974. }
  17975. template <typename U>
  17976. static Vc_INTRINSIC const U *lo(Operations::scatter, const U *ptr)
  17977. {
  17978. return ptr;
  17979. }
  17980. template <typename U>
  17981. static Vc_INTRINSIC const U *hi(Operations::scatter, const U *ptr)
  17982. {
  17983. return ptr + secondOffset;
  17984. }
  17985. template <typename U>
  17986. static Vc_ALWAYS_INLINE decltype(loImpl(std::declval<U>())) lo(U &&x)
  17987. {
  17988. return loImpl(std::forward<U>(x));
  17989. }
  17990. template <typename U>
  17991. static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval<U>())) hi(U &&x)
  17992. {
  17993. return hiImpl(std::forward<U>(x));
  17994. }
  17995. template <typename U>
  17996. static Vc_ALWAYS_INLINE enable_if<!have_lo_impl<U>(), U> lo(U &&x)
  17997. {
  17998. return std::forward<U>(x);
  17999. }
  18000. template <typename U>
  18001. static Vc_ALWAYS_INLINE enable_if<!have_hi_impl<U>(), U> hi(U &&x)
  18002. {
  18003. return std::forward<U>(x);
  18004. }
  18005. };
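// Split<secondOffset>::lo()/hi() select the first or second half of whatever they are given:
// SimdArray halves via internal_data0/1, native AVX vectors and masks via simd_cast to the SSE
// half, pointers and std::vector by offsetting, and plain pass-through when the argument has
// no halves. Illustrative sketch:
//   std::vector<float> buf(8);
//   Common::Split<4>::lo(buf);   // == buf.data()
//   Common::Split<4>::hi(buf);   // == buf.data() + 4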
  18006. template <typename Op, typename U, std::size_t M, typename V>
  18007. static Vc_INTRINSIC const V &actual_value(Op, const SimdArray<U, M, V, M> &x)
  18008. {
  18009. return internal_data(x);
  18010. }
  18011. template <typename Op, typename U, std::size_t M, typename V>
  18012. static Vc_INTRINSIC V *actual_value(Op, SimdArray<U, M, V, M> *x)
  18013. {
  18014. return &internal_data(*x);
  18015. }
  18016. template <typename Op, typename T, size_t Pieces, size_t Index>
  18017. static Vc_INTRINSIC typename Segment<T, Pieces, Index>::fixed_size_type actual_value(
  18018. Op, Segment<T, Pieces, Index> &&seg)
  18019. {
  18020. return seg.to_fixed_size();
  18021. }
  18022. template <typename Op, typename U, std::size_t M, typename V>
  18023. static Vc_INTRINSIC const typename V::Mask &actual_value(Op, const SimdMaskArray<U, M, V, M> &x)
  18024. {
  18025. return internal_data(x);
  18026. }
  18027. template <typename Op, typename U, std::size_t M, typename V>
  18028. static Vc_INTRINSIC typename V::Mask *actual_value(Op, SimdMaskArray<U, M, V, M> *x)
  18029. {
  18030. return &internal_data(*x);
  18031. }
  18032. template <typename Op, typename Arg>
  18033. Vc_INTRINSIC decltype(actual_value(std::declval<Op &>(), std::declval<Arg>()))
  18034. conditionalUnpack(std::true_type, Op op, Arg &&arg)
  18035. {
  18036. return actual_value(op, std::forward<Arg>(arg));
  18037. }
  18038. template <typename Op, typename Arg>
  18039. Vc_INTRINSIC Arg conditionalUnpack(std::false_type, Op, Arg &&arg)
  18040. {
  18041. return std::forward<Arg>(arg);
  18042. }
  18043. template <size_t A, size_t B>
  18044. struct selectorType : public std::integral_constant<bool, !((A & (size_t(1) << B)) != 0)> {
  18045. };
  18046. template <size_t I, typename Op, typename R, typename... Args, size_t... Indexes>
  18047. Vc_INTRINSIC decltype(std::declval<Op &>()(std::declval<R &>(),
  18048. conditionalUnpack(selectorType<I, Indexes>(),
  18049. std::declval<Op &>(),
  18050. std::declval<Args>())...))
  18051. unpackArgumentsAutoImpl(int, index_sequence<Indexes...>, Op op, R &&r, Args &&... args)
  18052. {
  18053. op(std::forward<R>(r),
  18054. conditionalUnpack(selectorType<I, Indexes>(), op, std::forward<Args>(args))...);
  18055. }
  18056. template <size_t I, typename Op, typename R, typename... Args, size_t... Indexes>
  18057. Vc_INTRINSIC enable_if<(I <= (size_t(1) << sizeof...(Args))), void> unpackArgumentsAutoImpl(
  18058. float, index_sequence<Indexes...> is, Op op, R &&r, Args &&... args)
  18059. {
  18060. static_assert(
  18061. I < (1 << sizeof...(Args)) - (std::is_same<R, std::nullptr_t>::value ? 1 : 0),
  18062. "Vc or compiler bug. Please report. Failed to find a combination of "
  18063. "actual_value(arg) transformations that allows calling Op.");
  18064. unpackArgumentsAutoImpl<I + 1, Op, R, Args...>(int(), is, op, std::forward<R>(r),
  18065. std::forward<Args>(args)...);
  18066. }
  18067. #ifdef Vc_ICC
  18068. template <size_t, typename... Ts> struct IccWorkaround {
  18069. using type = void;
  18070. };
  18071. template <typename... Ts> struct IccWorkaround<2, Ts...> {
  18072. using type = typename std::remove_pointer<typename std::decay<
  18073. typename std::tuple_element<1, std::tuple<Ts...>>::type>::type>::type;
  18074. };
  18075. #endif
  18076. template <typename Op, typename R, typename... Args>
  18077. Vc_INTRINSIC void unpackArgumentsAuto(Op op, R &&r, Args &&... args)
  18078. {
  18079. #ifdef Vc_ICC
  18080. const int recursionStart =
  18081. Traits::isSimdArray<
  18082. typename IccWorkaround<sizeof...(Args), Args...>::type>::value &&
  18083. (std::is_same<Op, Common::Operations::Forward_frexp>::value ||
  18084. std::is_same<Op, Common::Operations::Forward_ldexp>::value)
  18085. ? 2
  18086. : 0;
  18087. #else
  18088. const int recursionStart = 0;
  18089. #endif
  18090. unpackArgumentsAutoImpl<recursionStart>(
  18091. int(), make_index_sequence<sizeof...(Args)>(), op, std::forward<R>(r),
  18092. std::forward<Args>(args)...);
  18093. }
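// unpackArgumentsAuto is a brute-force dispatcher: the bit pattern I encodes, per argument,
// whether actual_value() is applied (unwrapping SimdArray/Segment arguments to their native
// vectors) or the argument is passed through untouched. The int/float overload pair above
// tries pattern I and, via SFINAE, falls back to I + 1 until op(r, args...) becomes callable;
// the static_assert fires if no combination works.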
  18094. }
  18095. }
  18096. #endif
  18097. #ifndef VC_COMMON_SIMDMASKARRAY_H_
  18098. #define VC_COMMON_SIMDMASKARRAY_H_
  18099. #include <type_traits>
  18100. #include <array>
  18101. namespace Vc_VERSIONED_NAMESPACE
  18102. {
  18103. template <typename T, std::size_t N, typename VectorType_>
  18104. class SimdMaskArray<T, N, VectorType_, N>
  18105. {
  18106. public:
  18107. using VectorType = VectorType_;
  18108. using vector_type = VectorType;
  18109. using mask_type = typename vector_type::Mask;
  18110. using storage_type = mask_type;
  18111. friend storage_type &internal_data(SimdMaskArray &m) { return m.data; }
  18112. friend const storage_type &internal_data(const SimdMaskArray &m) { return m.data; }
  18113. static constexpr std::size_t size() { return N; }
  18114. static constexpr std::size_t Size = size();
  18115. static constexpr std::size_t MemoryAlignment = storage_type::MemoryAlignment;
  18116. static_assert(Size == vector_type::Size, "size mismatch");
  18117. using vectorentry_type = typename mask_type::VectorEntryType;
  18118. using value_type = typename mask_type::EntryType;
  18119. using Mask = mask_type;
  18120. using VectorEntryType = vectorentry_type;
  18121. using EntryType = value_type;
  18122. using EntryReference = Vc::Detail::ElementReference<storage_type, SimdMaskArray>;
  18123. using reference = EntryReference;
  18124. using Vector = fixed_size_simd<T, N>;
  18125. Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(mask_type));
  18126. SimdMaskArray() = default;
  18127. SimdMaskArray(const SimdMaskArray &) = default;
  18128. SimdMaskArray(SimdMaskArray &&) = default;
  18129. SimdMaskArray &operator=(const SimdMaskArray &) = default;
  18130. SimdMaskArray &operator=(SimdMaskArray &&) = default;
  18131. Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerOne one) : data(one) {}
  18132. Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerZero zero) : data(zero) {}
  18133. Vc_INTRINSIC explicit SimdMaskArray(bool b) : data(b) {}
  18134. Vc_INTRINSIC static SimdMaskArray Zero() { return {private_init, storage_type::Zero()}; }
  18135. Vc_INTRINSIC static SimdMaskArray One() { return {private_init, storage_type::One()}; }
  18136. template <class U, class V, class = enable_if<N == V::Size>>
  18137. Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
  18138. template <class U, class V, class = enable_if<(N > V::Size && N <= 2 * V::Size)>,
  18139. class = U>
  18140. Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
  18141. template <class U, class V, class = enable_if<(N > 2 * V::Size && N <= 4 * V::Size)>,
  18142. class = U, class = U>
  18143. Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
  18144. template <typename M, std::size_t Pieces, std::size_t Index>
  18145. Vc_INTRINSIC_L SimdMaskArray(
  18146. Common::Segment<M, Pieces, Index> &&x,
  18147. enable_if<Traits::simd_vector_size<M>::value == Size * Pieces> = nullarg) Vc_INTRINSIC_R;
  18148. template <class M, class = enable_if<(Traits::is_simd_mask<M>::value &&
  18149. !Traits::isSimdMaskArray<M>::value &&
  18150. Traits::simd_vector_size<M>::value == Size)>>
  18151. Vc_INTRINSIC_L SimdMaskArray(M k) Vc_INTRINSIC_R;
  18152. template <class U, class A,
  18153. class = enable_if<Vc::Mask<U, A>::Size == N &&
  18154. !detail::is_fixed_size_abi<A>::value>>
  18155. operator Vc::Mask<U, A>() const
  18156. {
  18157. return simd_cast<Vc::Mask<U, A>>(data);
  18158. }
  18159. operator fixed_size_simd_mask<T, N> &()
  18160. {
  18161. return static_cast<fixed_size_simd_mask<T, N> &>(*this);
  18162. }
  18163. operator const fixed_size_simd_mask<T, N> &() const
  18164. {
  18165. return static_cast<const fixed_size_simd_mask<T, N> &>(*this);
  18166. }
  18167. template <typename Flags = DefaultLoadTag>
  18168. Vc_INTRINSIC explicit SimdMaskArray(const bool *mem, Flags f = Flags())
  18169. : data(mem, f)
  18170. {
  18171. }
  18172. Vc_INTRINSIC void load(const bool *mem) { data.load(mem); }
  18173. template <typename Flags> Vc_INTRINSIC void load(const bool *mem, Flags f)
  18174. {
  18175. data.load(mem, f);
  18176. }
  18177. Vc_INTRINSIC void store(bool *mem) const { data.store(mem); }
  18178. template <typename Flags> Vc_INTRINSIC void store(bool *mem, Flags f) const
  18179. {
  18180. data.store(mem, f);
  18181. }
  18182. Vc_INTRINSIC Vc_PURE bool operator==(const SimdMaskArray &rhs) const
  18183. {
  18184. return data == rhs.data;
  18185. }
  18186. Vc_INTRINSIC Vc_PURE bool operator!=(const SimdMaskArray &rhs) const
  18187. {
  18188. return data != rhs.data;
  18189. }
  18190. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator!() const
  18191. {
  18192. return {private_init, !data};
  18193. }
  18194. Vc_INTRINSIC SimdMaskArray &operator&=(const SimdMaskArray &rhs)
  18195. {
  18196. data &= rhs.data;
  18197. return *this;
  18198. }
  18199. Vc_INTRINSIC SimdMaskArray &operator|=(const SimdMaskArray &rhs)
  18200. {
  18201. data |= rhs.data;
  18202. return *this;
  18203. }
  18204. Vc_INTRINSIC SimdMaskArray &operator^=(const SimdMaskArray &rhs)
  18205. {
  18206. data ^= rhs.data;
  18207. return *this;
  18208. }
  18209. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&(
  18210. const SimdMaskArray &rhs) const
  18211. {
  18212. return {private_init, data & rhs.data};
  18213. }
  18214. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator|(
  18215. const SimdMaskArray &rhs) const
  18216. {
  18217. return {private_init, data | rhs.data};
  18218. }
  18219. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator^(
  18220. const SimdMaskArray &rhs) const
  18221. {
  18222. return {private_init, data ^ rhs.data};
  18223. }
  18224. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&&(
  18225. const SimdMaskArray &rhs) const
  18226. {
  18227. return {private_init, data && rhs.data};
  18228. }
  18229. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator||(
  18230. const SimdMaskArray &rhs) const
  18231. {
  18232. return {private_init, data || rhs.data};
  18233. }
  18234. Vc_INTRINSIC Vc_PURE bool isFull() const { return data.isFull(); }
  18235. Vc_INTRINSIC Vc_PURE bool isNotEmpty() const { return data.isNotEmpty(); }
  18236. Vc_INTRINSIC Vc_PURE bool isEmpty() const { return data.isEmpty(); }
  18237. Vc_INTRINSIC Vc_PURE bool isMix() const { return data.isMix(); }
  18238. Vc_INTRINSIC Vc_PURE int shiftMask() const { return data.shiftMask(); }
  18239. Vc_INTRINSIC Vc_PURE int toInt() const { return data.toInt(); }
  18240. private:
  18241. friend reference;
  18242. static Vc_INTRINSIC value_type get(const storage_type &k, int i) noexcept
  18243. {
  18244. return k[i];
  18245. }
  18246. template <typename U>
  18247. static Vc_INTRINSIC void set(storage_type &k, int i, U &&v) noexcept(
  18248. noexcept(std::declval<storage_type &>()[0] = std::declval<U>()))
  18249. {
  18250. k[i] = std::forward<U>(v);
  18251. }
  18252. public:
  18253. Vc_INTRINSIC Vc_PURE reference operator[](size_t index) noexcept
  18254. {
  18255. return {data, int(index)};
  18256. }
  18257. Vc_INTRINSIC Vc_PURE value_type operator[](size_t index) const noexcept
  18258. {
  18259. return data[index];
  18260. }
  18261. Vc_INTRINSIC Vc_PURE int count() const { return data.count(); }
  18262. Vc_INTRINSIC Vc_PURE int firstOne() const { return data.firstOne(); }
  18263. template <typename G>
  18264. static Vc_INTRINSIC fixed_size_simd_mask<T, N> generate(const G &gen)
  18265. {
  18266. return {private_init, mask_type::generate(gen)};
  18267. }
  18268. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> shifted(int amount) const
  18269. {
  18270. return {private_init, data.shifted(amount)};
  18271. }
  18272. template <typename Op, typename... Args>
  18273. static Vc_INTRINSIC fixed_size_simd_mask<T, N> fromOperation(Op op, Args &&... args)
  18274. {
  18275. fixed_size_simd_mask<T, N> r;
  18276. Common::unpackArgumentsAuto(op, r.data, std::forward<Args>(args)...);
  18277. return r;
  18278. }
  18279. Vc_INTRINSIC SimdMaskArray(private_init_t, mask_type &&x) : data(std::move(x)) {}
  18280. private:
  18281. alignas(static_cast<std::size_t>(
  18282. Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(VectorType_) /
  18283. VectorType_::size()>::value)) storage_type data;
  18284. };
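// The class above is the "atomic" specialization of SimdMaskArray: N equals the native mask
// width, so it stores exactly one mask_type and forwards every operation to it. The general
// template below splits into two recursively defined halves (data0/data1). Roughly, with the
// AVX2 ABI, SimdMaskArray<float, 8> is a single 8-wide AVX mask, while SimdMaskArray<float, 16>
// is a pair of 8-wide pieces.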
  18285. template <typename T, std::size_t N, typename VectorType> constexpr std::size_t SimdMaskArray<T, N, VectorType, N>::Size;
  18286. template <typename T, std::size_t N, typename VectorType>
  18287. constexpr std::size_t SimdMaskArray<T, N, VectorType, N>::MemoryAlignment;
  18288. template <typename T, size_t N, typename V, size_t Wt>
  18289. class SimdMaskArray
  18290. {
  18291. static constexpr std::size_t N0 = Common::left_size<N>();
  18292. using Split = Common::Split<N0>;
  18293. public:
  18294. using storage_type0 = fixed_size_simd_mask<T, N0>;
  18295. using storage_type1 = fixed_size_simd_mask<T, N - N0>;
  18296. static_assert(storage_type0::size() == N0, "");
  18297. using vector_type = fixed_size_simd<T, N>;
  18298. friend storage_type0 &internal_data0(SimdMaskArray &m) { return m.data0; }
  18299. friend storage_type1 &internal_data1(SimdMaskArray &m) { return m.data1; }
  18300. friend const storage_type0 &internal_data0(const SimdMaskArray &m) { return m.data0; }
  18301. friend const storage_type1 &internal_data1(const SimdMaskArray &m) { return m.data1; }
  18302. using mask_type = SimdMaskArray;
  18303. static constexpr std::size_t size() { return N; }
  18304. static constexpr std::size_t Size = size();
  18305. static constexpr std::size_t MemoryAlignment =
  18306. storage_type0::MemoryAlignment > storage_type1::MemoryAlignment
  18307. ? storage_type0::MemoryAlignment
  18308. : storage_type1::MemoryAlignment;
  18309. static_assert(Size == vector_type::Size, "size mismatch");
  18310. using vectorentry_type = typename storage_type0::VectorEntryType;
  18311. using value_type = typename storage_type0::EntryType;
  18312. using MaskType = mask_type;
  18313. using VectorEntryType = vectorentry_type;
  18314. using EntryType = value_type;
  18315. using EntryReference = Vc::Detail::ElementReference<SimdMaskArray>;
  18316. using reference = EntryReference;
  18317. using Vector = fixed_size_simd<T, N>;
  18318. Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(mask_type));
  18319. SimdMaskArray() = default;
  18320. SimdMaskArray(const SimdMaskArray &) = default;
  18321. SimdMaskArray(SimdMaskArray &&) = default;
  18322. SimdMaskArray &operator=(const SimdMaskArray &) = default;
  18323. SimdMaskArray &operator=(SimdMaskArray &&) = default;
  18324. template <typename U, typename W>
  18325. Vc_INTRINSIC SimdMaskArray(const SimdMaskArray<U, N, W> &rhs)
  18326. : data0(Split::lo(rhs)), data1(Split::hi(rhs))
  18327. {
  18328. }
  18329. template <typename M, std::size_t Pieces, std::size_t Index>
  18330. Vc_INTRINSIC SimdMaskArray(
  18331. Common::Segment<M, Pieces, Index> &&rhs,
  18332. enable_if<Traits::simd_vector_size<M>::value == Size * Pieces> = nullarg)
  18333. : data0(Split::lo(rhs)), data1(Split::hi(rhs))
  18334. {
  18335. }
  18336. template <class M, class = enable_if<(Traits::is_simd_mask<M>::value &&
  18337. !Traits::isSimdMaskArray<M>::value &&
  18338. Traits::simd_vector_size<M>::value == Size)>>
  18339. Vc_INTRINSIC SimdMaskArray(M k) : data0(Split::lo(k)), data1(Split::hi(k))
  18340. {
  18341. }
  18342. template <class U, class A,
  18343. class = enable_if<Vc::Mask<U, A>::Size == N &&
  18344. !detail::is_fixed_size_abi<A>::value>>
  18345. operator Vc::Mask<U, A>() const
  18346. {
  18347. return simd_cast<Vc::Mask<U, A>>(data0, data1);
  18348. }
  18349. Vc_INTRINSIC operator fixed_size_simd_mask<T, N> &()
  18350. {
  18351. return static_cast<fixed_size_simd_mask<T, N> &>(*this);
  18352. }
  18353. Vc_INTRINSIC operator const fixed_size_simd_mask<T, N> &() const
  18354. {
  18355. return static_cast<const fixed_size_simd_mask<T, N> &>(*this);
  18356. }
  18357. Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerOne one)
  18358. : data0(one), data1(one)
  18359. {
  18360. }
  18361. Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerZero zero)
  18362. : data0(zero), data1(zero)
  18363. {
  18364. }
  18365. Vc_INTRINSIC explicit SimdMaskArray(bool b) : data0(b), data1(b) {}
  18366. Vc_INTRINSIC static fixed_size_simd_mask<T, N> Zero()
  18367. {
  18368. return {storage_type0::Zero(), storage_type1::Zero()};
  18369. }
  18370. Vc_INTRINSIC static fixed_size_simd_mask<T, N> One()
  18371. {
  18372. return {storage_type0::One(), storage_type1::One()};
  18373. }
  18374. template <typename Flags = DefaultLoadTag>
  18375. Vc_INTRINSIC explicit SimdMaskArray(const bool *mem, Flags f = Flags())
  18376. : data0(mem, f), data1(mem + storage_type0::size(), f)
  18377. {
  18378. }
  18379. Vc_INTRINSIC void load(const bool *mem)
  18380. {
  18381. data0.load(mem);
  18382. data1.load(mem + storage_type0::size());
  18383. }
  18384. template <typename Flags> Vc_INTRINSIC void load(const bool *mem, Flags f)
  18385. {
  18386. data0.load(mem, f);
  18387. data1.load(mem + storage_type0::size(), f);
  18388. }
  18389. Vc_INTRINSIC void store(bool *mem) const
  18390. {
  18391. data0.store(mem);
  18392. data1.store(mem + storage_type0::size());
  18393. }
  18394. template <typename Flags> Vc_INTRINSIC void store(bool *mem, Flags f) const
  18395. {
  18396. data0.store(mem, f);
  18397. data1.store(mem + storage_type0::size(), f);
  18398. }
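// The bool-array loads and stores above split the memory range at storage_type0::size():
// the first chunk goes to/from data0, the remainder to/from data1. Minimal usage sketch
// (hypothetical 8-wide mask and buffer):
//
//   bool buf[8] = {true, false, true, false, true, false, true, false};
//   Vc::fixed_size_simd_mask<int, 8> m(&buf[0], Vc::Unaligned);  // load
//   m.store(&buf[0]);                                            // write back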
  18399. Vc_INTRINSIC Vc_PURE bool operator==(const SimdMaskArray &mask) const
  18400. {
  18401. return data0 == mask.data0 && data1 == mask.data1;
  18402. }
  18403. Vc_INTRINSIC Vc_PURE bool operator!=(const SimdMaskArray &mask) const
  18404. {
  18405. return data0 != mask.data0 || data1 != mask.data1;
  18406. }
  18407. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator!() const
  18408. {
  18409. return {!data0, !data1};
  18410. }
  18411. Vc_INTRINSIC SimdMaskArray &operator&=(const SimdMaskArray &rhs)
  18412. {
  18413. data0 &= rhs.data0;
  18414. data1 &= rhs.data1;
  18415. return *this;
  18416. }
  18417. Vc_INTRINSIC SimdMaskArray &operator|=(const SimdMaskArray &rhs)
  18418. {
  18419. data0 |= rhs.data0;
  18420. data1 |= rhs.data1;
  18421. return *this;
  18422. }
  18423. Vc_INTRINSIC SimdMaskArray &operator^=(const SimdMaskArray &rhs)
  18424. {
  18425. data0 ^= rhs.data0;
  18426. data1 ^= rhs.data1;
  18427. return *this;
  18428. }
  18429. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&(
  18430. const SimdMaskArray &rhs) const
  18431. {
  18432. return {data0 & rhs.data0, data1 & rhs.data1};
  18433. }
  18434. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator|(
  18435. const SimdMaskArray &rhs) const
  18436. {
  18437. return {data0 | rhs.data0, data1 | rhs.data1};
  18438. }
  18439. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator^(
  18440. const SimdMaskArray &rhs) const
  18441. {
  18442. return {data0 ^ rhs.data0, data1 ^ rhs.data1};
  18443. }
  18444. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&&(
  18445. const SimdMaskArray &rhs) const
  18446. {
  18447. return {data0 && rhs.data0, data1 && rhs.data1};
  18448. }
  18449. Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator||(
  18450. const SimdMaskArray &rhs) const
  18451. {
  18452. return {data0 || rhs.data0, data1 || rhs.data1};
  18453. }
  18454. Vc_INTRINSIC Vc_PURE bool isFull() const { return data0.isFull() && data1.isFull(); }
  18455. Vc_INTRINSIC Vc_PURE bool isNotEmpty() const { return data0.isNotEmpty() || data1.isNotEmpty(); }
  18456. Vc_INTRINSIC Vc_PURE bool isEmpty() const { return data0.isEmpty() && data1.isEmpty(); }
  18457. Vc_INTRINSIC Vc_PURE bool isMix() const { return !isFull() && !isEmpty(); }
  18458. Vc_INTRINSIC Vc_PURE int toInt() const
  18459. {
  18460. return data0.toInt() | (data1.toInt() << data0.size());
  18461. }
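// toInt() packs the mask into a bitfield with bit i representing element i; the bits of
// data1 are shifted above the data0.size() low bits. Example for a mask split 4 + 4 where
// only elements 0 and 5 are set: data0.toInt() == 0b0001, data1.toInt() == 0b0010, and the
// combined value is 0b0001 | (0b0010 << 4) == 0x21.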
  18462. private:
  18463. friend reference;
  18464. static Vc_INTRINSIC value_type get(const SimdMaskArray &o, int i) noexcept
  18465. {
  18466. if (i < int(o.data0.size())) {
  18467. return o.data0[i];
  18468. } else {
  18469. return o.data1[i - o.data0.size()];
  18470. }
  18471. }
  18472. template <typename U>
  18473. static Vc_INTRINSIC void set(SimdMaskArray &o, int i, U &&v) noexcept(
  18474. noexcept(std::declval<storage_type0 &>()[0] = std::declval<U>()) &&
  18475. noexcept(std::declval<storage_type1 &>()[0] = std::declval<U>()))
  18476. {
  18477. if (i < int(o.data0.size())) {
  18478. o.data0[i] = std::forward<U>(v);
  18479. } else {
  18480. o.data1[i - o.data0.size()] = std::forward<U>(v);
  18481. }
  18482. }
  18483. public:
  18484. Vc_INTRINSIC Vc_PURE reference operator[](size_t index) noexcept
  18485. {
  18486. return {*this, int(index)};
  18487. }
  18488. Vc_INTRINSIC Vc_PURE value_type operator[](size_t index) const noexcept
  18489. {
  18490. return get(*this, index);
  18491. }
  18492. Vc_INTRINSIC Vc_PURE int count() const { return data0.count() + data1.count(); }
  18493. Vc_INTRINSIC Vc_PURE int firstOne() const {
  18494. if (data0.isEmpty()) {
  18495. return data1.firstOne() + storage_type0::size();
  18496. }
  18497. return data0.firstOne();
  18498. }
  18499. template <typename G>
  18500. static Vc_INTRINSIC fixed_size_simd_mask<T, N> generate(const G &gen)
  18501. {
  18502. return {storage_type0::generate(gen),
  18503. storage_type1::generate([&](std::size_t i) { return gen(i + N0); })};
  18504. }
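// generate(gen) evaluates gen(i) for every element index i in [0, N); the second half
// receives its indices offset by N0 so the generator always sees global indices. Sketch:
//
//   auto every_third = Vc::fixed_size_simd_mask<float, 12>::generate(
//       [](std::size_t i) { return i % 3 == 0; });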
  18505. inline Vc_PURE fixed_size_simd_mask<T, N> shifted(int amount) const
  18506. {
  18507. if (Vc_IS_UNLIKELY(amount == 0)) {
  18508. return *this;
  18509. }
  18510. return generate([&](unsigned i) {
  18511. const unsigned j = i + amount;
  18512. return j < size() ? get(*this, j) : false;
  18513. });
  18514. }
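// shifted(amount) yields a mask whose element i equals element i + amount of the original,
// with out-of-range positions set to false; positive amounts shift entries towards index 0,
// negative amounts towards the end. The unsigned wrap-around of j lets the single
// comparison j < size() reject both directions of overflow.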
  18515. template <typename Op, typename... Args>
  18516. static Vc_INTRINSIC fixed_size_simd_mask<T, N> fromOperation(Op op, Args &&... args)
  18517. {
  18518. fixed_size_simd_mask<T, N> r = {
  18519. storage_type0::fromOperation(op, Split::lo(args)...),
  18520. storage_type1::fromOperation(op, Split::hi(std::forward<Args>(args))...)};
  18521. return r;
  18522. }
  18523. Vc_INTRINSIC SimdMaskArray(storage_type0 &&x, storage_type1 &&y)
  18524. : data0(std::move(x)), data1(std::move(y))
  18525. {
  18526. }
  18527. private:
  18528. alignas(static_cast<std::size_t>(
  18529. Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(V) /
  18530. V::size()>::value)) storage_type0 data0;
  18531. storage_type1 data1;
  18532. };
  18533. template <typename T, std::size_t N, typename V, std::size_t M>
  18534. constexpr std::size_t SimdMaskArray<T, N, V, M>::Size;
  18535. template <typename T, std::size_t N, typename V, std::size_t M>
  18536. constexpr std::size_t SimdMaskArray<T, N, V, M>::MemoryAlignment;
  18537. }
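// The converting constructors declared above are defined out of line in this separately
// guarded block (simd_cast_caller.tcc); keeping the definitions here, after the class
// bodies, presumably avoids a circular dependency on the simd_cast overloads they call.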
  18538. #ifndef VC_COMMON_SIMD_CAST_CALLER_TCC_
  18539. #define VC_COMMON_SIMD_CAST_CALLER_TCC_
  18540. namespace Vc_VERSIONED_NAMESPACE {
  18541. template <class T, std::size_t N, class VectorType>
  18542. template <class U, class V, class>
  18543. Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
  18544. const SimdMaskArray<U, N, V> &x)
  18545. : data(simd_cast<mask_type>(internal_data(x)))
  18546. {
  18547. }
  18548. template <class T, std::size_t N, class VectorType>
  18549. template <class U, class V, class, class>
  18550. Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
  18551. const SimdMaskArray<U, N, V> &x)
  18552. : data(simd_cast<mask_type>(internal_data(internal_data0(x)),
  18553. internal_data(internal_data1(x))))
  18554. {
  18555. }
  18556. template <class T, std::size_t N, class VectorType>
  18557. template <class U, class V, class, class, class>
  18558. Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
  18559. const SimdMaskArray<U, N, V> &x)
  18560. : data(simd_cast<mask_type>(internal_data(internal_data0(internal_data0(x))),
  18561. internal_data(internal_data1(internal_data0(x))),
  18562. internal_data(internal_data0(internal_data1(x))),
  18563. internal_data(internal_data1(internal_data1(x)))))
  18564. {
  18565. }
  18566. template <class T, std::size_t N, class VectorType>
  18567. template <class M, std::size_t Pieces, std::size_t Index>
  18568. Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
  18569. Common::Segment<M, Pieces, Index> &&x,
  18570. enable_if<Traits::simd_vector_size<M>::value == Size * Pieces>)
  18571. : data(simd_cast<mask_type, Index>(x.data))
  18572. {
  18573. }
  18574. template <class T, std::size_t N, class VectorType>
  18575. template <class M, class>
  18576. Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(M k)
  18577. : data(simd_cast<mask_type>(k))
  18578. {
  18579. }
  18580. }
  18581. #endif
  18582. #endif
  18583. #ifndef VC_COMMON_INTERLEAVE_H_
  18584. #define VC_COMMON_INTERLEAVE_H_
  18585. namespace Vc_VERSIONED_NAMESPACE
  18586. {
  18587. template <typename V, typename = enable_if<Traits::is_simd_vector<V>::value>>
  18588. std::pair<V, V> interleave(const V &a, const V &b)
  18589. {
  18590. return {a.interleaveLow(b), a.interleaveHigh(b)};
  18591. }
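// interleave(a, b) is a thin convenience wrapper: the pair's first element holds the
// interleaved low halves and the second the interleaved high halves. Sketch with 4-wide
// vectors a = [a0 a1 a2 a3], b = [b0 b1 b2 b3]:
//   first  = [a0 b0 a1 b1]
//   second = [a2 b2 a3 b3]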
  18592. }
  18593. #endif
  18594. namespace Vc_VERSIONED_NAMESPACE
  18595. {
  18596. namespace Common
  18597. {
  18598. template <std::size_t N, class... Candidates> struct select_best_vector_type_impl;
  18599. template <std::size_t N, class T> struct select_best_vector_type_impl<N, T> {
  18600. using type = T;
  18601. };
  18602. template <std::size_t N, class T, class... Candidates>
  18603. struct select_best_vector_type_impl<N, T, Candidates...> {
  18604. using type = typename std::conditional<
  18605. (N < T::Size), typename select_best_vector_type_impl<N, Candidates...>::type,
  18606. T>::type;
  18607. };
  18608. template <class T, std::size_t N>
  18609. struct select_best_vector_type : select_best_vector_type_impl<N,
  18610. #ifdef Vc_IMPL_AVX2
  18611. Vc::AVX2::Vector<T>,
  18612. #elif defined Vc_IMPL_AVX
  18613. Vc::AVX::Vector<T>,
  18614. #endif
  18615. #ifdef Vc_IMPL_SSE
  18616. Vc::SSE::Vector<T>,
  18617. #endif
  18618. Vc::Scalar::Vector<T>> {
  18619. };
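// select_best_vector_type walks the candidate list in order (widest enabled ABI first)
// and picks the first vector type whose Size does not exceed N, falling back to the
// scalar implementation otherwise. E.g. on a hypothetical AVX2 build,
// select_best_vector_type<float, 4>::type is Vc::SSE::Vector<float> (Size 4), whereas
// N == 3 falls through to Vc::Scalar::Vector<float>.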
  18620. }
  18621. namespace internal
  18622. {
  18623. template <typename T> T Vc_INTRINSIC Vc_PURE product_helper_(const T &l, const T &r) { return l * r; }
  18624. template <typename T> T Vc_INTRINSIC Vc_PURE sum_helper_(const T &l, const T &r) { return l + r; }
  18625. }
  18626. template <typename T, std::size_t N, typename V, std::size_t M>
  18627. inline fixed_size_simd<T, N> min(const SimdArray<T, N, V, M> &x,
  18628. const SimdArray<T, N, V, M> &y);
  18629. template <typename T, std::size_t N, typename V, std::size_t M>
  18630. inline fixed_size_simd<T, N> max(const SimdArray<T, N, V, M> &x,
  18631. const SimdArray<T, N, V, M> &y);
  18632. #define Vc_CURRENT_CLASS_NAME SimdArray
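// The partial specialization that follows is the "atomic" SimdArray: the requested width
// N matches the size of the selected native vector type exactly, so a single VectorType_
// member (data) stores all entries and every operation forwards to it directly. The
// generic split implementation further below handles every other N.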
  18633. template <typename T, std::size_t N, typename VectorType_>
  18634. class SimdArray<T, N, VectorType_, N>
  18635. {
  18636. static_assert(std::is_same<T, double>::value || std::is_same<T, float>::value ||
  18637. std::is_same<T, int32_t>::value ||
  18638. std::is_same<T, uint32_t>::value ||
  18639. std::is_same<T, int16_t>::value ||
  18640. std::is_same<T, uint16_t>::value,
  18641. "SimdArray<T, N> may only be used with T = { double, float, int32_t, uint32_t, "
  18642. "int16_t, uint16_t }");
  18643. static_assert(
  18644. std::is_same<VectorType_,
  18645. typename Common::select_best_vector_type<T, N>::type>::value &&
  18646. VectorType_::size() == N,
  18647. "ERROR: leave the third and fourth template parameters with their defaults. They "
  18648. "are implementation details.");
  18649. public:
  18650. static constexpr bool is_atomic = true;
  18651. using VectorType = VectorType_;
  18652. using vector_type = VectorType;
  18653. using storage_type = vector_type;
  18654. using vectorentry_type = typename vector_type::VectorEntryType;
  18655. using value_type = T;
  18656. using mask_type = fixed_size_simd_mask<T, N>;
  18657. using index_type = fixed_size_simd<int, N>;
  18658. static constexpr std::size_t size() { return N; }
  18659. using Mask = mask_type;
  18660. using MaskType = Mask;
  18661. using MaskArgument = const MaskType &;
  18662. using VectorEntryType = vectorentry_type;
  18663. using EntryType = value_type;
  18664. using IndexType = index_type;
  18665. using AsArg = const SimdArray &;
  18666. using reference = Detail::ElementReference<SimdArray>;
  18667. static constexpr std::size_t Size = size();
  18668. static constexpr std::size_t MemoryAlignment = storage_type::MemoryAlignment;
  18669. Vc_INTRINSIC SimdArray() = default;
  18670. Vc_INTRINSIC SimdArray(const SimdArray &) = default;
  18671. Vc_INTRINSIC SimdArray(SimdArray &&) = default;
  18672. Vc_INTRINSIC SimdArray &operator=(const SimdArray &) = default;
  18673. Vc_INTRINSIC SimdArray(const value_type &a) : data(a) {}
  18674. Vc_INTRINSIC SimdArray(value_type &a) : data(a) {}
  18675. Vc_INTRINSIC SimdArray(value_type &&a) : data(a) {}
  18676. template <
  18677. typename U,
  18678. typename = enable_if<std::is_same<U, int>::value && !std::is_same<int, value_type>::value>>
  18679. Vc_INTRINSIC SimdArray(U a)
  18680. : SimdArray(static_cast<value_type>(a))
  18681. {
  18682. }
  18683. template <class U, class V, class = enable_if<N == V::Size>>
  18684. Vc_INTRINSIC SimdArray(const SimdArray<U, N, V> &x)
  18685. : data(simd_cast<vector_type>(internal_data(x)))
  18686. {
  18687. }
  18688. template <class U, class V, class = enable_if<(N > V::Size && N <= 2 * V::Size)>,
  18689. class = U>
  18690. Vc_INTRINSIC SimdArray(const SimdArray<U, N, V> &x)
  18691. : data(simd_cast<vector_type>(internal_data(internal_data0(x)),
  18692. internal_data(internal_data1(x))))
  18693. {
  18694. }
  18695. template <class U, class V, class = enable_if<(N > 2 * V::Size && N <= 4 * V::Size)>,
  18696. class = U, class = U>
  18697. Vc_INTRINSIC SimdArray(const SimdArray<U, N, V> &x)
  18698. : data(simd_cast<vector_type>(internal_data(internal_data0(internal_data0(x))),
  18699. internal_data(internal_data1(internal_data0(x))),
  18700. internal_data(internal_data0(internal_data1(x))),
  18701. internal_data(internal_data1(internal_data1(x)))))
  18702. {
  18703. }
  18704. template <typename V, std::size_t Pieces, std::size_t Index>
  18705. Vc_INTRINSIC SimdArray(Common::Segment<V, Pieces, Index> &&x)
  18706. : data(simd_cast<vector_type, Index>(x.data))
  18707. {
  18708. }
  18709. Vc_INTRINSIC SimdArray(const std::initializer_list<value_type> &init)
  18710. : data(init.begin(), Vc::Unaligned)
  18711. {
  18712. Vc_ASSERT(init.size() == size());
  18713. }
  18714. template <
  18715. typename V,
  18716. typename = enable_if<Traits::is_simd_vector<V>::value && !Traits::isSimdArray<V>::value>>
  18717. Vc_INTRINSIC SimdArray(const V &x)
  18718. : data(simd_cast<vector_type>(x))
  18719. {
  18720. }
  18721. template <typename U, typename A,
  18722. typename =
  18723. enable_if<std::is_convertible<T, U>::value && Vector<U, A>::Size == N &&
  18724. !std::is_same<A, simd_abi::fixed_size<N>>::value>>
  18725. Vc_INTRINSIC operator Vector<U, A>() const
  18726. {
  18727. return simd_cast<Vector<U, A>>(data);
  18728. }
  18729. operator fixed_size_simd<T, N> &()
  18730. {
  18731. return static_cast<fixed_size_simd<T, N> &>(*this);
  18732. }
  18733. operator const fixed_size_simd<T, N> &() const
  18734. {
  18735. return static_cast<const fixed_size_simd<T, N> &>(*this);
  18736. }
  18737. #ifndef Vc_CURRENT_CLASS_NAME
  18738. #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
  18739. #endif
  18740. private:
  18741. template <class MT, class IT, int Scale = 1>
  18742. inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
  18743. template <class MT, class IT, int Scale = 1>
  18744. inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
  18745. MaskArgument mask);
  18746. public:
  18747. #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
  18748. static_assert( \
  18749. std::is_convertible<MT, EntryType>::value, \
  18750. "The memory pointer needs to point to a type that can be converted to the " \
  18751. "EntryType of this SIMD vector type."); \
  18752. static_assert( \
  18753. Vc::Traits::has_subscript_operator<IT>::value, \
  18754. "The indexes argument must be a type that implements the subscript operator."); \
  18755. static_assert( \
  18756. !Traits::is_simd_vector<IT>::value || \
  18757. Traits::simd_vector_size<IT>::value >= Size, \
  18758. "If you use a SIMD vector for the indexes parameter, the index vector must " \
  18759. "have at least as many entries as this SIMD vector."); \
  18760. static_assert( \
  18761. !std::is_array<T>::value || \
  18762. (std::rank<T>::value == 1 && \
  18763. (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
  18764. "If you use a simple array for the indexes parameter, the array must have " \
  18765. "at least as many entries as this SIMD vector.")
  18766. template <typename MT, typename IT,
  18767. typename = enable_if<Traits::has_subscript_operator<IT>::value>>
  18768. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
  18769. {
  18770. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  18771. gatherImplementation(
  18772. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
  18773. }
  18774. template <class MT, class IT, int Scale>
  18775. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
  18776. {
  18777. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  18778. gatherImplementation(args);
  18779. }
  18780. template <typename MT, typename IT,
  18781. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  18782. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
  18783. MaskArgument mask)
  18784. {
  18785. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  18786. gatherImplementation(
  18787. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
  18788. }
  18789. template <class MT, class IT, int Scale>
  18790. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
  18791. MaskArgument mask)
  18792. {
  18793. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  18794. gatherImplementation(args, mask);
  18795. }
  18796. template <typename MT, typename IT,
  18797. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  18798. Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
  18799. {
  18800. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  18801. gatherImplementation(
  18802. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
  18803. }
  18804. template <typename MT, typename IT,
  18805. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  18806. Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
  18807. {
  18808. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  18809. gatherImplementation(
  18810. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
  18811. }
  18812. template <class MT, class IT, int Scale>
  18813. Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
  18814. {
  18815. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  18816. gatherImplementation(args);
  18817. }
  18818. template <class MT, class IT, int Scale>
  18819. Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
  18820. MaskArgument mask)
  18821. {
  18822. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  18823. gatherImplementation(args, mask);
  18824. }
  18825. #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
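// A gather reads size() elements from arbitrary positions of an array, either via the
// gathering constructors or the gather() members above. Minimal usage sketch
// (hypothetical data; the index argument only needs operator[]):
//
//   float table[100];                                    // filled elsewhere
//   Vc::fixed_size_simd<int, 8> idx = {0, 3, 6, 9, 12, 15, 18, 21};
//   Vc::fixed_size_simd<float, 8> v(&table[0], idx);     // v[i] = table[idx[i]]
//   v.gather(&table[0], idx);                            // same, on an existing object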
  18826. private:
  18827. template <typename MT, typename IT>
  18828. inline void scatterImplementation(MT *mem, IT &&indexes) const;
  18829. template <typename MT, typename IT>
  18830. inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
  18831. public:
  18832. #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
  18833. static_assert( \
  18834. std::is_convertible<EntryType, MT>::value, \
  18835. "The memory pointer needs to point to a type that the EntryType of this " \
  18836. "SIMD vector type can be converted to."); \
  18837. static_assert( \
  18838. Vc::Traits::has_subscript_operator<IT>::value, \
  18839. "The indexes argument must be a type that implements the subscript operator."); \
  18840. static_assert( \
  18841. !Traits::is_simd_vector<IT>::value || \
  18842. Traits::simd_vector_size<IT>::value >= Size, \
  18843. "If you use a SIMD vector for the indexes parameter, the index vector must " \
  18844. "have at least as many entries as this SIMD vector."); \
  18845. static_assert( \
  18846. !std::is_array<T>::value || \
  18847. (std::rank<T>::value == 1 && \
  18848. (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
  18849. "If you use a simple array for the indexes parameter, the array must have " \
  18850. "at least as many entries as this SIMD vector.")
  18851. template <typename MT,
  18852. typename IT,
  18853. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  18854. Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
  18855. {
  18856. Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
  18857. scatterImplementation(mem, std::forward<IT>(indexes));
  18858. }
  18859. template <typename MT,
  18860. typename IT,
  18861. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  18862. Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
  18863. {
  18864. Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
  18865. scatterImplementation(mem, std::forward<IT>(indexes), mask);
  18866. }
  18867. template <typename MT, typename IT>
  18868. Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
  18869. {
  18870. scatter(args.address, args.indexes);
  18871. }
  18872. template <typename MT, typename IT>
  18873. Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
  18874. {
  18875. scatter(args.address, args.indexes, mask);
  18876. }
  18877. #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
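// scatter is the inverse of gather: it writes the size() elements of *this to the
// positions named by the index argument, optionally restricted by a mask. Sketch,
// continuing the gather example above with a hypothetical mask m:
//
//   v.scatter(&table[0], idx);        // table[idx[i]] = v[i] for every i
//   v.scatter(&table[0], idx, m);     // only where m[i] is true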
  18878. explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerZero) : data() {}
  18879. explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerOne o) : data(o) {}
  18880. explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerIndexesFromZero i) : data(i)
  18881. {
  18882. }
  18883. template <std::size_t Offset>
  18884. explicit Vc_INTRINSIC SimdArray(
  18885. Common::AddOffset<VectorSpecialInitializerIndexesFromZero, Offset>)
  18886. : data(Vc::IndexesFromZero)
  18887. {
  18888. data += value_type(Offset);
  18889. }
  18890. Vc_INTRINSIC void setZero() { data.setZero(); }
  18891. Vc_INTRINSIC void setZero(mask_type k) { data.setZero(internal_data(k)); }
  18892. Vc_INTRINSIC void setZeroInverted() { data.setZeroInverted(); }
  18893. Vc_INTRINSIC void setZeroInverted(mask_type k) { data.setZeroInverted(internal_data(k)); }
  18894. Vc_INTRINSIC void setQnan() { data.setQnan(); }
  18895. Vc_INTRINSIC void setQnan(mask_type m) { data.setQnan(internal_data(m)); }
  18896. template <typename Op, typename... Args>
  18897. static Vc_INTRINSIC fixed_size_simd<T, N> fromOperation(Op op, Args &&... args)
  18898. {
  18899. fixed_size_simd<T, N> r;
  18900. Common::unpackArgumentsAuto(op, r.data, std::forward<Args>(args)...);
  18901. return r;
  18902. }
  18903. template <typename Op, typename... Args>
  18904. static Vc_INTRINSIC void callOperation(Op op, Args &&... args)
  18905. {
  18906. Common::unpackArgumentsAuto(op, nullptr, std::forward<Args>(args)...);
  18907. }
  18908. static Vc_INTRINSIC fixed_size_simd<T, N> Zero()
  18909. {
  18910. return SimdArray(Vc::Zero);
  18911. }
  18912. static Vc_INTRINSIC fixed_size_simd<T, N> One()
  18913. {
  18914. return SimdArray(Vc::One);
  18915. }
  18916. static Vc_INTRINSIC fixed_size_simd<T, N> IndexesFromZero()
  18917. {
  18918. return SimdArray(Vc::IndexesFromZero);
  18919. }
  18920. static Vc_INTRINSIC fixed_size_simd<T, N> Random()
  18921. {
  18922. return fromOperation(Common::Operations::random());
  18923. }
  18924. template <class U, class Flags = DefaultLoadTag,
  18925. class = enable_if<std::is_arithmetic<U>::value &&
  18926. Traits::is_load_store_flag<Flags>::value>>
  18927. explicit Vc_INTRINSIC SimdArray(const U *mem, Flags f = Flags()) : data(mem, f)
  18928. {
  18929. }
  18930. template <typename... Args> Vc_INTRINSIC void load(Args &&... args)
  18931. {
  18932. data.load(std::forward<Args>(args)...);
  18933. }
  18934. template <typename... Args> Vc_INTRINSIC void store(Args &&... args) const
  18935. {
  18936. data.store(std::forward<Args>(args)...);
  18937. }
  18938. Vc_INTRINSIC mask_type operator!() const
  18939. {
  18940. return {private_init, !data};
  18941. }
  18942. Vc_INTRINSIC fixed_size_simd<T, N> operator-() const
  18943. {
  18944. return {private_init, -data};
  18945. }
  18946. Vc_INTRINSIC fixed_size_simd<T, N> operator+() const { return *this; }
  18947. Vc_INTRINSIC fixed_size_simd<T, N> operator~() const
  18948. {
  18949. return {private_init, ~data};
  18950. }
  18951. template <typename U,
  18952. typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
  18953. Vc_INTRINSIC Vc_CONST fixed_size_simd<T, N> operator<<(U x) const
  18954. {
  18955. return {private_init, data << x};
  18956. }
  18957. template <typename U,
  18958. typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
  18959. Vc_INTRINSIC fixed_size_simd<T, N> &operator<<=(U x)
  18960. {
  18961. data <<= x;
  18962. return *this;
  18963. }
  18964. template <typename U,
  18965. typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
  18966. Vc_INTRINSIC Vc_CONST fixed_size_simd<T, N> operator>>(U x) const
  18967. {
  18968. return {private_init, data >> x};
  18969. }
  18970. template <typename U,
  18971. typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
  18972. Vc_INTRINSIC fixed_size_simd<T, N> &operator>>=(U x)
  18973. {
  18974. data >>= x;
  18975. return *this;
  18976. }
  18977. #define Vc_BINARY_OPERATOR_(op) \
  18978. Vc_INTRINSIC fixed_size_simd<T, N> &operator op##=(const SimdArray &rhs) \
  18979. { \
  18980. data op## = rhs.data; \
  18981. return *this; \
  18982. }
  18983. Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATOR_);
  18984. Vc_ALL_BINARY(Vc_BINARY_OPERATOR_);
  18985. Vc_ALL_SHIFTS(Vc_BINARY_OPERATOR_);
  18986. #undef Vc_BINARY_OPERATOR_
  18987. Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC MaskType isNegative() const
  18988. {
  18989. return {private_init, isnegative(data)};
  18990. }
  18991. private:
  18992. friend reference;
  18993. Vc_INTRINSIC static value_type get(const SimdArray &o, int i) noexcept
  18994. {
  18995. return o.data[i];
  18996. }
  18997. template <typename U>
  18998. Vc_INTRINSIC static void set(SimdArray &o, int i, U &&v) noexcept(
  18999. noexcept(std::declval<value_type &>() = v))
  19000. {
  19001. o.data[i] = v;
  19002. }
  19003. public:
  19004. Vc_INTRINSIC reference operator[](size_t i) noexcept
  19005. {
  19006. static_assert(noexcept(reference{std::declval<SimdArray &>(), int()}), "");
  19007. return {*this, int(i)};
  19008. }
  19009. Vc_INTRINSIC value_type operator[](size_t i) const noexcept
  19010. {
  19011. return get(*this, int(i));
  19012. }
  19013. Vc_INTRINSIC Common::WriteMaskedVector<SimdArray, mask_type> operator()(const mask_type &k)
  19014. {
  19015. return {*this, k};
  19016. }
  19017. Vc_INTRINSIC void assign(const SimdArray &v, const mask_type &k)
  19018. {
  19019. data.assign(v.data, internal_data(k));
  19020. }
  19021. #define Vc_REDUCTION_FUNCTION_(name_) \
  19022. Vc_INTRINSIC Vc_PURE value_type name_() const { return data.name_(); } \
  19023. Vc_INTRINSIC Vc_PURE value_type name_(mask_type mask) const \
  19024. { \
  19025. return data.name_(internal_data(mask)); \
  19026. } \
  19027. Vc_NOTHING_EXPECTING_SEMICOLON
  19028. Vc_REDUCTION_FUNCTION_(min);
  19029. Vc_REDUCTION_FUNCTION_(max);
  19030. Vc_REDUCTION_FUNCTION_(product);
  19031. Vc_REDUCTION_FUNCTION_(sum);
  19032. #undef Vc_REDUCTION_FUNCTION_
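// The reductions generated above (min, max, product, sum) and their masked overloads all
// forward to the underlying native vector; a masked overload, e.g. v.sum(mask), reduces
// only the selected entries. Sketch:
//
//   Vc::fixed_size_simd<float, 8> v = {1, 2, 3, 4, 5, 6, 7, 8};
//   v.sum();   // 36
//   v.min();   // 1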
  19033. Vc_INTRINSIC Vc_PURE fixed_size_simd<T, N> partialSum() const
  19034. {
  19035. return {private_init, data.partialSum()};
  19036. }
  19037. template <typename F> Vc_INTRINSIC fixed_size_simd<T, N> apply(F &&f) const
  19038. {
  19039. return {private_init, data.apply(std::forward<F>(f))};
  19040. }
  19041. template <typename F> Vc_INTRINSIC fixed_size_simd<T, N> apply(F &&f, const mask_type &k) const
  19042. {
  19043. return {private_init, data.apply(std::forward<F>(f), k)};
  19044. }
  19045. Vc_INTRINSIC fixed_size_simd<T, N> shifted(int amount) const
  19046. {
  19047. return {private_init, data.shifted(amount)};
  19048. }
  19049. template <std::size_t NN>
  19050. Vc_INTRINSIC fixed_size_simd<T, N> shifted(int amount, const SimdArray<value_type, NN> &shiftIn)
  19051. const
  19052. {
  19053. return {private_init, data.shifted(amount, simd_cast<VectorType>(shiftIn))};
  19054. }
  19055. Vc_INTRINSIC fixed_size_simd<T, N> rotated(int amount) const
  19056. {
  19057. return {private_init, data.rotated(amount)};
  19058. }
  19059. Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC fixed_size_simd<T, N> exponent() const
  19060. {
  19061. return {private_init, exponent(data)};
  19062. }
  19063. Vc_INTRINSIC fixed_size_simd<T, N> interleaveLow(SimdArray x) const
  19064. {
  19065. return {private_init, data.interleaveLow(x.data)};
  19066. }
  19067. Vc_INTRINSIC fixed_size_simd<T, N> interleaveHigh(SimdArray x) const
  19068. {
  19069. return {private_init, data.interleaveHigh(x.data)};
  19070. }
  19071. Vc_INTRINSIC fixed_size_simd<T, N> reversed() const
  19072. {
  19073. return {private_init, data.reversed()};
  19074. }
  19075. Vc_INTRINSIC fixed_size_simd<T, N> sorted() const
  19076. {
  19077. return {private_init, data.sorted()};
  19078. }
  19079. template <class G, class = decltype(std::declval<G>()(std::size_t())),
  19080. class = enable_if<!Traits::is_simd_vector<G>::value>>
  19081. Vc_INTRINSIC SimdArray(const G &gen) : data(gen)
  19082. {
  19083. }
  19084. template <typename G> static Vc_INTRINSIC fixed_size_simd<T, N> generate(const G &gen)
  19085. {
  19086. return {private_init, VectorType::generate(gen)};
  19087. }
  19088. Vc_DEPRECATED("use copysign(x, y) instead")
  19089. Vc_INTRINSIC fixed_size_simd<T, N> copySign(const SimdArray &x) const
  19090. {
  19091. return {private_init, Vc::copysign(data, x.data)};
  19092. }
  19093. friend VectorType &internal_data<>(SimdArray &x);
  19094. friend const VectorType &internal_data<>(const SimdArray &x);
  19095. Vc_INTRINSIC SimdArray(private_init_t, VectorType &&x) : data(std::move(x)) {}
  19096. Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(storage_type));
  19097. private:
  19098. alignas(static_cast<std::size_t>(
  19099. Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(VectorType_) /
  19100. VectorType_::size()>::value)) storage_type data;
  19101. };
  19102. template <typename T, std::size_t N, typename VectorType> constexpr std::size_t SimdArray<T, N, VectorType, N>::Size;
  19103. template <typename T, std::size_t N, typename VectorType>
  19104. constexpr std::size_t SimdArray<T, N, VectorType, N>::MemoryAlignment;
  19105. template <typename T, std::size_t N, typename VectorType>
  19106. #ifndef Vc_MSVC
  19107. Vc_INTRINSIC
  19108. #endif
  19109. VectorType &internal_data(SimdArray<T, N, VectorType, N> &x)
  19110. {
  19111. return x.data;
  19112. }
  19113. template <typename T, std::size_t N, typename VectorType>
  19114. #ifndef Vc_MSVC
  19115. Vc_INTRINSIC
  19116. #endif
  19117. const VectorType &internal_data(const SimdArray<T, N, VectorType, N> &x)
  19118. {
  19119. return x.data;
  19120. }
  19121. template <class T> Vc_INTRINSIC T unwrap(const T &x) { return x; }
  19122. template <class T, size_t N, class V>
  19123. Vc_INTRINSIC V unwrap(const SimdArray<T, N, V, N> &x)
  19124. {
  19125. return internal_data(x);
  19126. }
  19127. template <class T, size_t Pieces, size_t Index>
  19128. Vc_INTRINSIC auto unwrap(const Common::Segment<T, Pieces, Index> &x)
  19129. -> decltype(x.to_fixed_size())
  19130. {
  19131. return unwrap(x.to_fixed_size());
  19132. }
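// unwrap() strips one layer of wrapping so the gather/scatter implementations below can
// pass plain data to the underlying vector: a SimdArray yields its native vector via
// internal_data(), a Segment is first converted to its fixed-size equivalent, and
// anything else is passed through unchanged.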
  19133. template <typename T, std::size_t N, typename VectorType>
  19134. template <class MT, class IT, int Scale>
  19135. Vc_INTRINSIC void SimdArray<T, N, VectorType, N>::gatherImplementation(
  19136. const Common::GatherArguments<MT, IT, Scale> &args)
  19137. {
  19138. data.gather(Common::make_gather<Scale>(args.address, unwrap(args.indexes)));
  19139. }
  19140. template <typename T, std::size_t N, typename VectorType>
  19141. template <class MT, class IT, int Scale>
  19142. Vc_INTRINSIC void SimdArray<T, N, VectorType, N>::gatherImplementation(
  19143. const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
  19144. {
  19145. data.gather(Common::make_gather<Scale>(args.address, unwrap(args.indexes)),
  19146. mask);
  19147. }
  19148. template <typename T, std::size_t N, typename VectorType>
  19149. template <typename MT, typename IT>
  19150. inline void SimdArray<T, N, VectorType, N>::scatterImplementation(MT *mem,
  19151. IT &&indexes) const
  19152. {
  19153. data.scatter(mem, unwrap(std::forward<IT>(indexes)));
  19154. }
  19155. template <typename T, std::size_t N, typename VectorType>
  19156. template <typename MT, typename IT>
  19157. inline void SimdArray<T, N, VectorType, N>::scatterImplementation(MT *mem,
  19158. IT &&indexes,
  19159. MaskArgument mask) const
  19160. {
  19161. data.scatter(mem, unwrap(std::forward<IT>(indexes)), mask);
  19162. }
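// The class that follows is the generic, split SimdArray used whenever N does not match
// a native vector width: analogous to the split SimdMaskArray above, the first N0 entries
// live in data0 and the remaining N1 entries in data1 (both fixed-size SimdArrays
// themselves), and all operations delegate to the two halves via Common::Split.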
  19163. template <typename T, size_t N, typename V, size_t Wt> class SimdArray
  19164. {
  19165. static_assert(std::is_same<T, double>::value ||
  19166. std::is_same<T, float>::value ||
  19167. std::is_same<T, int32_t>::value ||
  19168. std::is_same<T, uint32_t>::value ||
  19169. std::is_same<T, int16_t>::value ||
  19170. std::is_same<T, uint16_t>::value, "SimdArray<T, N> may only be used with T = { double, float, int32_t, uint32_t, int16_t, uint16_t }");
  19171. static_assert(
  19172. std::is_same<V, typename Common::select_best_vector_type<T, N>::type>::value &&
  19173. V::size() == Wt,
  19174. "ERROR: leave the third and fourth template parameters with their defaults. They "
  19175. "are implementation details.");
  19176. static_assert(
  19177. std::is_same<typename V::EntryType, typename V::VectorEntryType>::value ||
  19178. (N % V::size() == 0),
  19179. "SimdArray<(un)signed short, N> on MIC only works correctly for N = k * "
  19180. "MIC::(u)short_v::size(), i.e. k * 16.");
  19181. using my_traits = SimdArrayTraits<T, N>;
  19182. static constexpr std::size_t N0 = my_traits::N0;
  19183. static constexpr std::size_t N1 = my_traits::N1;
  19184. using Split = Common::Split<N0>;
  19185. template <typename U, std::size_t K> using CArray = U[K];
  19186. public:
  19187. static constexpr bool is_atomic = false;
  19188. using storage_type0 = typename my_traits::storage_type0;
  19189. using storage_type1 = typename my_traits::storage_type1;
  19190. static_assert(storage_type0::size() == N0, "");
  19191. using vector_type = V;
  19192. using vectorentry_type = typename storage_type0::vectorentry_type;
  19193. typedef vectorentry_type alias_type Vc_MAY_ALIAS;
  19194. using value_type = T;
  19195. using mask_type = fixed_size_simd_mask<T, N>;
  19196. using index_type = fixed_size_simd<int, N>;
  19197. static constexpr std::size_t size() { return N; }
  19198. using Mask = mask_type;
  19199. using MaskType = Mask;
  19200. using MaskArgument = const MaskType &;
  19201. using VectorEntryType = vectorentry_type;
  19202. using EntryType = value_type;
  19203. using IndexType = index_type;
  19204. using AsArg = const SimdArray &;
  19205. using reference = Detail::ElementReference<SimdArray>;
  19206. static constexpr std::size_t MemoryAlignment =
  19207. storage_type0::MemoryAlignment > storage_type1::MemoryAlignment
  19208. ? storage_type0::MemoryAlignment
  19209. : storage_type1::MemoryAlignment;
  19210. static Vc_INTRINSIC fixed_size_simd<T, N> Zero()
  19211. {
  19212. return SimdArray(Vc::Zero);
  19213. }
  19214. static Vc_INTRINSIC fixed_size_simd<T, N> One()
  19215. {
  19216. return SimdArray(Vc::One);
  19217. }
  19218. static Vc_INTRINSIC fixed_size_simd<T, N> IndexesFromZero()
  19219. {
  19220. return SimdArray(Vc::IndexesFromZero);
  19221. }
  19222. static Vc_INTRINSIC fixed_size_simd<T, N> Random()
  19223. {
  19224. return fromOperation(Common::Operations::random());
  19225. }
  19226. template <class G, class = decltype(std::declval<G>()(std::size_t())),
  19227. class = enable_if<!Traits::is_simd_vector<G>::value>>
  19228. Vc_INTRINSIC SimdArray(const G &gen)
  19229. : data0(gen), data1([&](std::size_t i) { return gen(i + storage_type0::size()); })
  19230. {
  19231. }
  19232. template <typename G> static Vc_INTRINSIC fixed_size_simd<T, N> generate(const G &gen)
  19233. {
  19234. auto tmp = storage_type0::generate(gen);
  19235. return {std::move(tmp),
  19236. storage_type1::generate([&](std::size_t i) { return gen(i + N0); })};
  19237. }
  19238. SimdArray() = default;
  19239. Vc_INTRINSIC SimdArray(value_type a) : data0(a), data1(a) {}
  19240. template <
  19241. typename U,
  19242. typename = enable_if<std::is_same<U, int>::value && !std::is_same<int, value_type>::value>>
  19243. SimdArray(U a)
  19244. : SimdArray(static_cast<value_type>(a))
  19245. {
  19246. }
  19247. SimdArray(const SimdArray &) = default;
  19248. SimdArray(SimdArray &&) = default;
  19249. SimdArray &operator=(const SimdArray &) = default;
  19250. template <typename U, typename Flags = DefaultLoadTag,
  19251. typename = enable_if<std::is_arithmetic<U>::value &&
  19252. Traits::is_load_store_flag<Flags>::value>>
  19253. explicit Vc_INTRINSIC SimdArray(const U *mem, Flags f = Flags())
  19254. : data0(mem, f), data1(mem + storage_type0::size(), f)
  19255. {
  19256. }
  19257. #ifndef Vc_MSVC
  19258. template <typename U, std::size_t Extent, typename Flags = DefaultLoadTag,
  19259. typename = enable_if<std::is_arithmetic<U>::value &&
  19260. Traits::is_load_store_flag<Flags>::value>>
  19261. explicit Vc_INTRINSIC SimdArray(CArray<U, Extent> &mem, Flags f = Flags())
  19262. : data0(&mem[0], f), data1(&mem[storage_type0::size()], f)
  19263. {
  19264. }
  19265. template <typename U, std::size_t Extent, typename Flags = DefaultLoadTag,
  19266. typename = enable_if<std::is_arithmetic<U>::value &&
  19267. Traits::is_load_store_flag<Flags>::value>>
  19268. explicit Vc_INTRINSIC SimdArray(const CArray<U, Extent> &mem, Flags f = Flags())
  19269. : data0(&mem[0], f), data1(&mem[storage_type0::size()], f)
  19270. {
  19271. }
  19272. #endif
  19273. Vc_INTRINSIC SimdArray(const std::initializer_list<value_type> &init)
  19274. : data0(init.begin(), Vc::Unaligned)
  19275. , data1(init.begin() + storage_type0::size(), Vc::Unaligned)
  19276. {
  19277. Vc_ASSERT(init.size() == size());
  19278. }
  19279. #ifndef Vc_CURRENT_CLASS_NAME
  19280. #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
  19281. #endif
  19282. private:
  19283. template <class MT, class IT, int Scale = 1>
  19284. inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
  19285. template <class MT, class IT, int Scale = 1>
  19286. inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
  19287. MaskArgument mask);
  19288. public:
  19289. #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
  19290. static_assert( \
  19291. std::is_convertible<MT, EntryType>::value, \
  19292. "The memory pointer needs to point to a type that can be converted to the " \
  19293. "EntryType of this SIMD vector type."); \
  19294. static_assert( \
  19295. Vc::Traits::has_subscript_operator<IT>::value, \
  19296. "The indexes argument must be a type that implements the subscript operator."); \
  19297. static_assert( \
  19298. !Traits::is_simd_vector<IT>::value || \
  19299. Traits::simd_vector_size<IT>::value >= Size, \
  19300. "If you use a SIMD vector for the indexes parameter, the index vector must " \
  19301. "have at least as many entries as this SIMD vector."); \
  19302. static_assert( \
  19303. !std::is_array<T>::value || \
  19304. (std::rank<T>::value == 1 && \
  19305. (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
  19306. "If you use a simple array for the indexes parameter, the array must have " \
  19307. "at least as many entries as this SIMD vector.")
  19308. template <typename MT, typename IT,
  19309. typename = enable_if<Traits::has_subscript_operator<IT>::value>>
  19310. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
  19311. {
  19312. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  19313. gatherImplementation(
  19314. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
  19315. }
  19316. template <class MT, class IT, int Scale>
  19317. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
  19318. {
  19319. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  19320. gatherImplementation(args);
  19321. }
  19322. template <typename MT, typename IT,
  19323. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  19324. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
  19325. MaskArgument mask)
  19326. {
  19327. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  19328. gatherImplementation(
  19329. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
  19330. }
  19331. template <class MT, class IT, int Scale>
  19332. Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
  19333. MaskArgument mask)
  19334. {
  19335. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  19336. gatherImplementation(args, mask);
  19337. }
  19338. template <typename MT, typename IT,
  19339. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  19340. Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
  19341. {
  19342. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  19343. gatherImplementation(
  19344. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
  19345. }
  19346. template <typename MT, typename IT,
  19347. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  19348. Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
  19349. {
  19350. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  19351. gatherImplementation(
  19352. Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
  19353. }
  19354. template <class MT, class IT, int Scale>
  19355. Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
  19356. {
  19357. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  19358. gatherImplementation(args);
  19359. }
  19360. template <class MT, class IT, int Scale>
  19361. Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
  19362. MaskArgument mask)
  19363. {
  19364. Vc_ASSERT_GATHER_PARAMETER_TYPES_;
  19365. gatherImplementation(args, mask);
  19366. }
  19367. #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
  19368. private:
  19369. template <typename MT, typename IT>
  19370. inline void scatterImplementation(MT *mem, IT &&indexes) const;
  19371. template <typename MT, typename IT>
  19372. inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
  19373. public:
  19374. #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
  19375. static_assert( \
  19376. std::is_convertible<EntryType, MT>::value, \
  19377. "The memory pointer needs to point to a type that the EntryType of this " \
  19378. "SIMD vector type can be converted to."); \
  19379. static_assert( \
  19380. Vc::Traits::has_subscript_operator<IT>::value, \
  19381. "The indexes argument must be a type that implements the subscript operator."); \
  19382. static_assert( \
  19383. !Traits::is_simd_vector<IT>::value || \
  19384. Traits::simd_vector_size<IT>::value >= Size, \
  19385. "If you use a SIMD vector for the indexes parameter, the index vector must " \
  19386. "have at least as many entries as this SIMD vector."); \
  19387. static_assert( \
  19388. !std::is_array<T>::value || \
  19389. (std::rank<T>::value == 1 && \
  19390. (std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
  19391. "If you use a simple array for the indexes parameter, the array must have " \
  19392. "at least as many entries as this SIMD vector.")
  19393. template <typename MT,
  19394. typename IT,
  19395. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  19396. Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
  19397. {
  19398. Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
  19399. scatterImplementation(mem, std::forward<IT>(indexes));
  19400. }
  19401. template <typename MT,
  19402. typename IT,
  19403. typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
  19404. Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
  19405. {
  19406. Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
  19407. scatterImplementation(mem, std::forward<IT>(indexes), mask);
  19408. }
  19409. template <typename MT, typename IT>
  19410. Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
  19411. {
  19412. scatter(args.address, args.indexes);
  19413. }
  19414. template <typename MT, typename IT>
  19415. Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
  19416. {
  19417. scatter(args.address, args.indexes, mask);
  19418. }
  19419. #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
  19420. explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerZero) : data0(), data1() {}
  19421. explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerOne o) : data0(o), data1(o) {}
  19422. explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerIndexesFromZero i)
  19423. : data0(i)
  19424. , data1(Common::AddOffset<VectorSpecialInitializerIndexesFromZero,
  19425. storage_type0::size()>())
  19426. {
  19427. }
  19428. template <size_t Offset>
  19429. explicit Vc_INTRINSIC SimdArray(
  19430. Common::AddOffset<VectorSpecialInitializerIndexesFromZero, Offset> i)
  19431. : data0(i)
  19432. , data1(Common::AddOffset<VectorSpecialInitializerIndexesFromZero,
  19433. storage_type0::size() + Offset>())
  19434. {
  19435. }
  19436. template <class W, class = enable_if<
  19437. (Traits::is_simd_vector<W>::value &&
  19438. Traits::simd_vector_size<W>::value == N &&
  19439. !(std::is_convertible<Traits::entry_type_of<W>, T>::value &&
  19440. Traits::isSimdArray<W>::value))>>
  19441. Vc_INTRINSIC explicit SimdArray(W &&x) : data0(Split::lo(x)), data1(Split::hi(x))
  19442. {
  19443. }
  19444. template <class W, class = enable_if<
  19445. (Traits::isSimdArray<W>::value &&
  19446. Traits::simd_vector_size<W>::value == N &&
  19447. std::is_convertible<Traits::entry_type_of<W>, T>::value)>,
  19448. class = W>
  19449. Vc_INTRINSIC SimdArray(W &&x) : data0(Split::lo(x)), data1(Split::hi(x))
  19450. {
  19451. }
  19452. template <class W, std::size_t Pieces, std::size_t Index>
  19453. Vc_INTRINSIC SimdArray(Common::Segment<W, Pieces, Index> &&x)
  19454. : data0(Common::Segment<W, 2 * Pieces, 2 * Index>{x.data})
  19455. , data1(Common::Segment<W, 2 * Pieces, 2 * Index + 1>{x.data})
  19456. {
  19457. }
  19458. template <typename U, typename A,
  19459. typename =
  19460. enable_if<std::is_convertible<T, U>::value && Vector<U, A>::Size == N &&
  19461. !std::is_same<A, simd_abi::fixed_size<N>>::value>>
  19462. operator Vector<U, A>() const
  19463. {
  19464. auto r = simd_cast<Vector<U, A>>(data0, data1);
  19465. return r;
  19466. }
  19467. Vc_INTRINSIC operator fixed_size_simd<T, N> &()
  19468. {
  19469. return static_cast<fixed_size_simd<T, N> &>(*this);
  19470. }
  19471. Vc_INTRINSIC operator const fixed_size_simd<T, N> &() const
  19472. {
  19473. return static_cast<const fixed_size_simd<T, N> &>(*this);
  19474. }
  19475. Vc_INTRINSIC void setZero()
  19476. {
  19477. data0.setZero();
  19478. data1.setZero();
  19479. }
  19480. Vc_INTRINSIC void setZero(const mask_type &k)
  19481. {
  19482. data0.setZero(Split::lo(k));
  19483. data1.setZero(Split::hi(k));
  19484. }
  19485. Vc_INTRINSIC void setZeroInverted()
  19486. {
  19487. data0.setZeroInverted();
  19488. data1.setZeroInverted();
  19489. }
  19490. Vc_INTRINSIC void setZeroInverted(const mask_type &k)
  19491. {
  19492. data0.setZeroInverted(Split::lo(k));
  19493. data1.setZeroInverted(Split::hi(k));
  19494. }
  19495. Vc_INTRINSIC void setQnan() {
  19496. data0.setQnan();
  19497. data1.setQnan();
  19498. }
  19499. Vc_INTRINSIC void setQnan(const mask_type &m) {
  19500. data0.setQnan(Split::lo(m));
  19501. data1.setQnan(Split::hi(m));
  19502. }
  19503. template <typename Op, typename... Args>
  19504. static Vc_INTRINSIC fixed_size_simd<T, N> fromOperation(Op op, Args &&... args)
  19505. {
  19506. fixed_size_simd<T, N> r = {
  19507. storage_type0::fromOperation(op, Split::lo(args)...),
  19508. storage_type1::fromOperation(op, Split::hi(std::forward<Args>(args))...)};
  19509. return r;
  19510. }
  19511. template <typename Op, typename... Args>
  19512. static Vc_INTRINSIC void callOperation(Op op, Args &&... args)
  19513. {
  19514. storage_type0::callOperation(op, Split::lo(args)...);
  19515. storage_type1::callOperation(op, Split::hi(std::forward<Args>(args))...);
  19516. }
  19517. template <typename U, typename... Args> Vc_INTRINSIC void load(const U *mem, Args &&... args)
  19518. {
  19519. data0.load(mem, Split::lo(args)...);
  19520. data1.load(mem + storage_type0::size(), Split::hi(std::forward<Args>(args))...);
  19521. }
  19522. template <typename U, typename... Args> Vc_INTRINSIC void store(U *mem, Args &&... args) const
  19523. {
  19524. data0.store(mem, Split::lo(args)...);
  19525. data1.store(mem + storage_type0::size(), Split::hi(std::forward<Args>(args))...);
  19526. }
  19527. Vc_INTRINSIC mask_type operator!() const
  19528. {
  19529. return {!data0, !data1};
  19530. }
  19531. Vc_INTRINSIC fixed_size_simd<T, N> operator-() const
  19532. {
  19533. return {-data0, -data1};
  19534. }
  19535. Vc_INTRINSIC fixed_size_simd<T, N> operator+() const { return *this; }
  19536. Vc_INTRINSIC fixed_size_simd<T, N> operator~() const
  19537. {
  19538. return {~data0, ~data1};
  19539. }
  19540. template <typename U,
  19541. typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
  19542. Vc_INTRINSIC Vc_CONST fixed_size_simd<T, N> operator<<(U x) const
  19543. {
  19544. return {data0 << x, data1 << x};
  19545. }
  19546. template <typename U,
  19547. typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
  19548. Vc_INTRINSIC fixed_size_simd<T, N> &operator<<=(U x)
  19549. {
  19550. data0 <<= x;
  19551. data1 <<= x;
  19552. return *this;
  19553. }
  19554. template <typename U,
  19555. typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
  19556. Vc_INTRINSIC Vc_CONST fixed_size_simd<T, N> operator>>(U x) const
  19557. {
  19558. return {data0 >> x, data1 >> x};
  19559. }
  19560. template <typename U,
  19561. typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
  19562. Vc_INTRINSIC fixed_size_simd<T, N> &operator>>=(U x)
  19563. {
  19564. data0 >>= x;
  19565. data1 >>= x;
  19566. return *this;
  19567. }
  19568. #define Vc_BINARY_OPERATOR_(op) \
  19569. Vc_INTRINSIC fixed_size_simd<T, N> &operator op##=(const SimdArray &rhs) \
  19570. { \
  19571. data0 op## = rhs.data0; \
  19572. data1 op## = rhs.data1; \
  19573. return *this; \
  19574. }
  19575. Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATOR_);
  19576. Vc_ALL_BINARY(Vc_BINARY_OPERATOR_);
  19577. Vc_ALL_SHIFTS(Vc_BINARY_OPERATOR_);
  19578. #undef Vc_BINARY_OPERATOR_
  19579. private:
  19580. friend reference;
  19581. Vc_INTRINSIC static value_type get(const SimdArray &o, int i) noexcept
  19582. {
  19583. return reinterpret_cast<const alias_type *>(&o)[i];
  19584. }
  19585. template <typename U>
  19586. Vc_INTRINSIC static void set(SimdArray &o, int i, U &&v) noexcept(
  19587. noexcept(std::declval<value_type &>() = v))
  19588. {
  19589. reinterpret_cast<alias_type *>(&o)[i] = v;
  19590. }
  19591. public:
  19592. Vc_INTRINSIC reference operator[](size_t i) noexcept
  19593. {
  19594. static_assert(noexcept(reference{std::declval<SimdArray &>(), int()}), "");
  19595. return {*this, int(i)};
  19596. }
  19597. Vc_INTRINSIC value_type operator[](size_t index) const noexcept
  19598. {
  19599. return get(*this, int(index));
  19600. }
  19601. Vc_INTRINSIC Common::WriteMaskedVector<SimdArray, mask_type> operator()(
  19602. const mask_type &mask)
  19603. {
  19604. return {*this, mask};
  19605. }
  19606. Vc_INTRINSIC void assign(const SimdArray &v, const mask_type &k)
  19607. {
  19608. data0.assign(v.data0, internal_data0(k));
  19609. data1.assign(v.data1, internal_data1(k));
  19610. }
  19611. #define Vc_REDUCTION_FUNCTION_(name_,binary_fun_,scalar_fun_) \
  19612. private: \
  19613. template <typename ForSfinae = void> \
  19614. Vc_INTRINSIC enable_if<std::is_same<ForSfinae, void>::value && \
  19615. storage_type0::Size == storage_type1::Size, \
  19616. value_type> name_##_impl() const \
  19617. { \
  19618. return binary_fun_(data0, data1).name_(); \
  19619. } \
  19620. \
  19621. template <typename ForSfinae = void> \
  19622. Vc_INTRINSIC enable_if<std::is_same<ForSfinae, void>::value && \
  19623. storage_type0::Size != storage_type1::Size, \
  19624. value_type> name_##_impl() const \
  19625. { \
  19626. return scalar_fun_(data0.name_(), data1.name_()); \
  19627. } \
  19628. \
  19629. public: \
  19630. \
  19631. Vc_INTRINSIC value_type name_() const { return name_##_impl(); } \
  19632. \
  19633. Vc_INTRINSIC value_type name_(const mask_type &mask) const \
  19634. { \
  19635. if (Vc_IS_UNLIKELY(Split::lo(mask).isEmpty())) { \
  19636. return data1.name_(Split::hi(mask)); \
  19637. } else if (Vc_IS_UNLIKELY(Split::hi(mask).isEmpty())) { \
  19638. return data0.name_(Split::lo(mask)); \
  19639. } else { \
  19640. return scalar_fun_(data0.name_(Split::lo(mask)), \
  19641. data1.name_(Split::hi(mask))); \
  19642. } \
  19643. } \
  19644. Vc_NOTHING_EXPECTING_SEMICOLON
  19645. Vc_REDUCTION_FUNCTION_(min, Vc::min, std::min);
  19646. Vc_REDUCTION_FUNCTION_(max, Vc::max, std::max);
  19647. Vc_REDUCTION_FUNCTION_(product, internal::product_helper_, internal::product_helper_);
  19648. Vc_REDUCTION_FUNCTION_(sum, internal::sum_helper_, internal::sum_helper_);
  19649. #undef Vc_REDUCTION_FUNCTION_
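// The reduction macro above chooses between two strategies: when both halves have the
// same width, they are first combined element-wise (e.g. Vc::min(data0, data1)) and then
// reduced with a single native reduction; otherwise each half is reduced separately and
// the two scalars are combined (e.g. std::min). The masked overload additionally
// short-circuits when one half of the mask is completely empty.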
  19650. Vc_INTRINSIC Vc_PURE fixed_size_simd<T, N> partialSum() const
  19651. {
  19652. auto ps0 = data0.partialSum();
  19653. auto tmp = data1;
  19654. tmp[0] += ps0[data0.size() - 1];
  19655. return {std::move(ps0), tmp.partialSum()};
  19656. }
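// partialSum() computes an inclusive prefix sum across both halves: the last prefix sum
// of data0 is added to the first element of a copy of data1, so the carry propagates into
// the whole second half. Worked example for {1, 2, 3, 4} split 2 + 2:
//   ps0 = {1, 3}, tmp = {3 + 3, 4} = {6, 4}, tmp.partialSum() = {6, 10}
// giving the expected {1, 3, 6, 10}.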
  19657. template <typename F> inline fixed_size_simd<T, N> apply(F &&f) const
  19658. {
  19659. return {data0.apply(f), data1.apply(f)};
  19660. }
  19661. template <typename F>
  19662. inline fixed_size_simd<T, N> apply(F &&f, const mask_type &k) const
  19663. {
  19664. return {data0.apply(f, Split::lo(k)), data1.apply(f, Split::hi(k))};
  19665. }
  19666. inline fixed_size_simd<T, N> shifted(int amount) const
  19667. {
  19668. constexpr int SSize = Size;
  19669. constexpr int SSize0 = storage_type0::Size;
  19670. constexpr int SSize1 = storage_type1::Size;
  19671. if (amount == 0) {
  19672. return *this;
  19673. }
  19674. if (amount < 0) {
  19675. if (amount > -SSize0) {
  19676. return {data0.shifted(amount), data1.shifted(amount, data0)};
  19677. }
  19678. if (amount == -SSize0) {
  19679. return {storage_type0(0), simd_cast<storage_type1>(data0)};
  19680. }
  19681. if (amount < -SSize0) {
  19682. return {storage_type0(0), simd_cast<storage_type1>(data0.shifted(
  19683. amount + SSize0))};
  19684. }
  19685. return Zero();
  19686. } else {
  19687. if (amount >= SSize) {
  19688. return Zero();
  19689. } else if (amount >= SSize0) {
  19690. return {
  19691. simd_cast<storage_type0>(data1).shifted(amount - SSize0),
  19692. storage_type1(0)};
  19693. } else if (amount >= SSize1) {
  19694. return {data0.shifted(amount, data1), storage_type1(0)};
  19695. } else {
  19696. return {data0.shifted(amount, data1), data1.shifted(amount)};
  19697. }
  19698. }
  19699. }
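// shifted(amount): positive amounts move elements towards index 0, negative amounts towards
// index Size-1, with zeros shifted in; the branches above pick the cheapest recombination of
// the two storage halves. E.g. {1,2,3,4}.shifted(1) == {2,3,4,0} and shifted(-1) == {0,1,2,3}.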
  19700. template <std::size_t NN>
  19701. inline enable_if<
  19702. !(std::is_same<storage_type0, storage_type1>::value &&
  19703. N == NN),
  19704. fixed_size_simd<T, N>>
  19705. shifted(int amount, const SimdArray<value_type, NN> &shiftIn) const
  19706. {
  19707. constexpr int SSize = Size;
  19708. if (amount < 0) {
  19709. return fixed_size_simd<T, N>([&](int i) -> value_type {
  19710. i += amount;
  19711. if (i >= 0) {
  19712. return operator[](i);
  19713. } else if (i >= -SSize) {
  19714. return shiftIn[i + SSize];
  19715. }
  19716. return 0;
  19717. });
  19718. }
  19719. return fixed_size_simd<T, N>([&](int i) -> value_type {
  19720. i += amount;
  19721. if (i < SSize) {
  19722. return operator[](i);
  19723. } else if (i < 2 * SSize) {
  19724. return shiftIn[i - SSize];
  19725. }
  19726. return 0;
  19727. });
  19728. }
  19729. private:
  19730. template <std::size_t NN> struct bisectable_shift
  19731. : public std::integral_constant<bool,
  19732. std::is_same<storage_type0, storage_type1>::value &&
  19733. N == NN>
  19734. {
  19735. };
  19736. public:
  19737. template <std::size_t NN>
  19738. inline fixed_size_simd<T, N> shifted(
  19739. enable_if<bisectable_shift<NN>::value, int> amount,
  19740. const SimdArray<value_type, NN> &shiftIn) const
  19741. {
  19742. constexpr int SSize = Size;
  19743. if (amount < 0) {
  19744. if (amount > -static_cast<int>(storage_type0::Size)) {
  19745. return {data0.shifted(amount, internal_data1(shiftIn)),
  19746. data1.shifted(amount, data0)};
  19747. }
  19748. if (amount == -static_cast<int>(storage_type0::Size)) {
  19749. return {storage_type0(internal_data1(shiftIn)), storage_type1(data0)};
  19750. }
  19751. if (amount > -SSize) {
  19752. return {
  19753. internal_data1(shiftIn)
  19754. .shifted(amount + static_cast<int>(storage_type0::Size), internal_data0(shiftIn)),
  19755. data0.shifted(amount + static_cast<int>(storage_type0::Size), internal_data1(shiftIn))};
  19756. }
  19757. if (amount == -SSize) {
  19758. return shiftIn;
  19759. }
  19760. if (amount > -2 * SSize) {
  19761. return shiftIn.shifted(amount + SSize);
  19762. }
  19763. }
  19764. if (amount == 0) {
  19765. return *this;
  19766. }
  19767. if (amount < static_cast<int>(storage_type0::Size)) {
  19768. return {data0.shifted(amount, data1),
  19769. data1.shifted(amount, internal_data0(shiftIn))};
  19770. }
  19771. if (amount == static_cast<int>(storage_type0::Size)) {
  19772. return {storage_type0(data1), storage_type1(internal_data0(shiftIn))};
  19773. }
  19774. if (amount < SSize) {
  19775. return {data1.shifted(amount - static_cast<int>(storage_type0::Size), internal_data0(shiftIn)),
  19776. internal_data0(shiftIn)
  19777. .shifted(amount - static_cast<int>(storage_type0::Size), internal_data1(shiftIn))};
  19778. }
  19779. if (amount == SSize) {
  19780. return shiftIn;
  19781. }
  19782. if (amount < 2 * SSize) {
  19783. return shiftIn.shifted(amount - SSize);
  19784. }
  19785. return Zero();
  19786. }
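// The shifted(amount, shiftIn) overloads shift within a window twice as wide: positions that
// would otherwise receive zeros are filled from shiftIn instead. E.g.
// {1,2,3,4}.shifted(2, {5,6,7,8}) yields {3,4,5,6} and shifted(-1, {5,6,7,8}) yields {8,1,2,3}.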
  19787. Vc_INTRINSIC fixed_size_simd<T, N> rotated(int amount) const
  19788. {
  19789. amount %= int(size());
  19790. if (amount == 0) {
  19791. return *this;
  19792. } else if (amount < 0) {
  19793. amount += size();
  19794. }
  19795. #ifdef Vc_MSVC
  19796. alignas(MemoryAlignment) T tmp[N + data0.size()];
  19797. data0.store(&tmp[0], Vc::Aligned);
  19798. data1.store(&tmp[data0.size()], Vc::Aligned);
  19799. data0.store(&tmp[N], Vc::Unaligned);
  19800. fixed_size_simd<T, N> r;
  19801. r.data0.load(&tmp[amount], Vc::Unaligned);
  19802. r.data1.load(&tmp[(amount + data0.size()) % size()], Vc::Unaligned);
  19803. return r;
  19804. #else
  19805. auto &&d0cvtd = simd_cast<storage_type1>(data0);
  19806. auto &&d1cvtd = simd_cast<storage_type0>(data1);
  19807. constexpr int size0 = storage_type0::size();
  19808. constexpr int size1 = storage_type1::size();
  19809. if (amount == size0 && std::is_same<storage_type0, storage_type1>::value) {
  19810. return {std::move(d1cvtd), std::move(d0cvtd)};
  19811. } else if (amount < size1) {
  19812. return {data0.shifted(amount, d1cvtd), data1.shifted(amount, d0cvtd)};
  19813. } else if (amount == size1) {
  19814. return {data0.shifted(amount, d1cvtd), std::move(d0cvtd)};
  19815. } else if (int(size()) - amount < size1) {
  19816. return {data0.shifted(amount - int(size()), d1cvtd.shifted(size1 - size0)),
  19817. data1.shifted(amount - int(size()), data0.shifted(size0 - size1))};
  19818. } else if (int(size()) - amount == size1) {
  19819. return {data0.shifted(-size1, d1cvtd.shifted(size1 - size0)),
  19820. simd_cast<storage_type1>(data0.shifted(size0 - size1))};
  19821. } else if (amount <= size0) {
  19822. return {data0.shifted(size1, d1cvtd).shifted(amount - size1, data0),
  19823. simd_cast<storage_type1>(data0.shifted(amount - size1))};
  19824. } else {
  19825. return {data0.shifted(size1, d1cvtd).shifted(amount - size1, data0),
  19826. simd_cast<storage_type1>(data0.shifted(amount - size1, d1cvtd))};
  19827. }
  19828. return *this;
  19829. #endif
  19830. }
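// rotated(amount) is the cyclic variant of shifted(): elements pushed out on one side re-enter
// on the other, e.g. {1,2,3,4}.rotated(1) == {2,3,4,1}. The MSVC path spills to a stack
// buffer, the generic path recombines the two storage halves directly.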
  19831. Vc_INTRINSIC fixed_size_simd<T, N> interleaveLow(const SimdArray &x) const
  19832. {
  19833. return {data0.interleaveLow(x.data0),
  19834. simd_cast<storage_type1>(data0.interleaveHigh(x.data0))};
  19835. }
  19836. Vc_INTRINSIC fixed_size_simd<T, N> interleaveHigh(const SimdArray &x) const
  19837. {
  19838. return interleaveHighImpl(
  19839. x,
  19840. std::integral_constant<bool, storage_type0::Size == storage_type1::Size>());
  19841. }
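// interleaveLow()/interleaveHigh() zip two arrays element-wise: for a = {a0,a1,a2,a3} and
// b = {b0,b1,b2,b3}, a.interleaveLow(b) is {a0,b0,a1,b1} and a.interleaveHigh(b) is {a2,b2,a3,b3}.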
  19842. private:
  19843. Vc_INTRINSIC fixed_size_simd<T, N> interleaveHighImpl(const SimdArray &x, std::true_type) const
  19844. {
  19845. return {data1.interleaveLow(x.data1), data1.interleaveHigh(x.data1)};
  19846. }
  19847. inline fixed_size_simd<T, N> interleaveHighImpl(const SimdArray &x, std::false_type) const
  19848. {
  19849. return {data0.interleaveHigh(x.data0)
  19850. .shifted(storage_type1::Size,
  19851. simd_cast<storage_type0>(data1.interleaveLow(x.data1))),
  19852. data1.interleaveHigh(x.data1)};
  19853. }
  19854. public:
  19855. inline fixed_size_simd<T, N> reversed() const
  19856. {
  19857. if (std::is_same<storage_type0, storage_type1>::value) {
  19858. return {simd_cast<storage_type0>(data1).reversed(),
  19859. simd_cast<storage_type1>(data0).reversed()};
  19860. } else {
  19861. #ifdef Vc_MSVC
  19862. alignas(MemoryAlignment) T tmp[N];
  19863. data1.reversed().store(&tmp[0], Vc::Aligned);
  19864. data0.reversed().store(&tmp[data1.size()], Vc::Unaligned);
  19865. return fixed_size_simd<T, N>{&tmp[0], Vc::Aligned};
  19866. #else
  19867. return {data0.shifted(storage_type1::Size, data1).reversed(),
  19868. simd_cast<storage_type1>(data0.reversed().shifted(
  19869. storage_type0::Size - storage_type1::Size))};
  19870. #endif
  19871. }
  19872. }
  19873. inline fixed_size_simd<T, N> sorted() const
  19874. {
  19875. return sortedImpl(
  19876. std::integral_constant<bool, storage_type0::Size == storage_type1::Size>());
  19877. }
  19878. Vc_INTRINSIC fixed_size_simd<T, N> sortedImpl(std::true_type) const
  19879. {
  19880. #ifdef Vc_DEBUG_SORTED
  19881. std::cerr << "-- " << data0 << data1 << '\n';
  19882. #endif
  19883. const auto a = data0.sorted();
  19884. const auto b = data1.sorted().reversed();
  19885. const auto lo = Vc::min(a, b);
  19886. const auto hi = Vc::max(a, b);
  19887. return {lo.sorted(), hi.sorted()};
  19888. }
  19889. Vc_INTRINSIC fixed_size_simd<T, N> sortedImpl(std::false_type) const
  19890. {
  19891. using SortableArray =
  19892. fixed_size_simd<value_type, Common::NextPowerOfTwo<size()>::value>;
  19893. auto sortable = simd_cast<SortableArray>(*this);
  19894. for (std::size_t i = Size; i < SortableArray::Size; ++i) {
  19895. using limits = std::numeric_limits<value_type>;
  19896. if (limits::has_infinity) {
  19897. sortable[i] = limits::infinity();
  19898. } else {
  19899. sortable[i] = std::numeric_limits<value_type>::max();
  19900. }
  19901. }
  19902. return simd_cast<fixed_size_simd<T, N>>(sortable.sorted());
  19903. }
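// sorted() returns the elements in ascending order. Equally sized halves are merged with the
// min/max network above; otherwise the array is padded to the next power of two with the
// largest representable value (or +infinity) and sorted at that width.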
  19904. static constexpr std::size_t Size = size();
  19905. Vc_DEPRECATED("use exponent(x) instead")
  19906. Vc_INTRINSIC fixed_size_simd<T, N> exponent() const
  19907. {
  19908. return {exponent(data0), exponent(data1)};
  19909. }
  19910. Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC MaskType isNegative() const
  19911. {
  19912. return {isnegative(data0), isnegative(data1)};
  19913. }
  19914. Vc_DEPRECATED("use copysign(x, y) instead")
  19915. Vc_INTRINSIC fixed_size_simd<T, N> copySign(const SimdArray &x) const
  19916. {
  19917. return {Vc::copysign(data0, x.data0),
  19918. Vc::copysign(data1, x.data1)};
  19919. }
  19920. friend storage_type0 &internal_data0<>(SimdArray &x);
  19921. friend storage_type1 &internal_data1<>(SimdArray &x);
  19922. friend const storage_type0 &internal_data0<>(const SimdArray &x);
  19923. friend const storage_type1 &internal_data1<>(const SimdArray &x);
  19924. Vc_INTRINSIC SimdArray(storage_type0 &&x, storage_type1 &&y)
  19925. : data0(std::move(x)), data1(std::move(y))
  19926. {
  19927. }
  19928. Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(storage_type0));
  19929. private:
  19930. alignas(static_cast<std::size_t>(
  19931. Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(V) /
  19932. V::size()>::value)) storage_type0 data0;
  19933. storage_type1 data1;
  19934. };
  19935. #undef Vc_CURRENT_CLASS_NAME
  19936. template <typename T, std::size_t N, typename V, std::size_t M>
  19937. constexpr std::size_t SimdArray<T, N, V, M>::Size;
  19938. template <typename T, std::size_t N, typename V, std::size_t M>
  19939. constexpr std::size_t SimdArray<T, N, V, M>::MemoryAlignment;
  19940. template <typename T, std::size_t N, typename VectorType, std::size_t M>
  19941. template <class MT, class IT, int Scale>
  19942. inline void SimdArray<T, N, VectorType, M>::gatherImplementation(
  19943. const Common::GatherArguments<MT, IT, Scale> &args)
  19944. {
  19945. data0.gather(Common::make_gather<Scale>(
  19946. args.address, Split::lo(Common::Operations::gather(), args.indexes)));
  19947. data1.gather(Common::make_gather<Scale>(
  19948. args.address, Split::hi(Common::Operations::gather(), args.indexes)));
  19949. }
  19950. template <typename T, std::size_t N, typename VectorType, std::size_t M>
  19951. template <class MT, class IT, int Scale>
  19952. inline void SimdArray<T, N, VectorType, M>::gatherImplementation(
  19953. const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
  19954. {
  19955. data0.gather(Common::make_gather<Scale>(
  19956. args.address, Split::lo(Common::Operations::gather(), args.indexes)),
  19957. Split::lo(mask));
  19958. data1.gather(Common::make_gather<Scale>(
  19959. args.address, Split::hi(Common::Operations::gather(), args.indexes)),
  19960. Split::hi(mask));
  19961. }
  19962. template <typename T, std::size_t N, typename VectorType, std::size_t M>
  19963. template <typename MT, typename IT>
  19964. inline void SimdArray<T, N, VectorType, M>::scatterImplementation(MT *mem,
  19965. IT &&indexes) const
  19966. {
  19967. data0.scatter(mem, Split::lo(Common::Operations::gather(),
  19968. indexes));
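// 'indexes' is deliberately not forwarded in the call above: moving from it there would
// invalidate the second use in the next statement.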
  19969. data1.scatter(mem, Split::hi(Common::Operations::gather(), std::forward<IT>(indexes)));
  19970. }
  19971. template <typename T, std::size_t N, typename VectorType, std::size_t M>
  19972. template <typename MT, typename IT>
  19973. inline void SimdArray<T, N, VectorType, M>::scatterImplementation(MT *mem,
  19974. IT &&indexes, MaskArgument mask) const
  19975. {
  19976. data0.scatter(mem, Split::lo(Common::Operations::gather(), indexes),
  19977. Split::lo(mask));
  19978. data1.scatter(mem, Split::hi(Common::Operations::gather(), std::forward<IT>(indexes)),
  19979. Split::hi(mask));
  19980. }
  19981. template <typename T, std::size_t N, typename V, std::size_t M>
  19982. #ifndef Vc_MSVC
  19983. Vc_INTRINSIC
  19984. #endif
  19985. typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
  19986. SimdArray<T, N, V, M> &x)
  19987. {
  19988. return x.data0;
  19989. }
  19990. template <typename T, std::size_t N, typename V, std::size_t M>
  19991. #ifndef Vc_MSVC
  19992. Vc_INTRINSIC
  19993. #endif
  19994. typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
  19995. SimdArray<T, N, V, M> &x)
  19996. {
  19997. return x.data1;
  19998. }
  19999. template <typename T, std::size_t N, typename V, std::size_t M>
  20000. #ifndef Vc_MSVC
  20001. Vc_INTRINSIC
  20002. #endif
  20003. const typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
  20004. const SimdArray<T, N, V, M> &x)
  20005. {
  20006. return x.data0;
  20007. }
  20008. template <typename T, std::size_t N, typename V, std::size_t M>
  20009. #ifndef Vc_MSVC
  20010. Vc_INTRINSIC
  20011. #endif
  20012. const typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
  20013. const SimdArray<T, N, V, M> &x)
  20014. {
  20015. return x.data1;
  20016. }
  20017. #if defined Vc_MSVC && defined Vc_IMPL_SSE && !defined Vc_IMPL_AVX
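// The specialization below appears to work around an MSVC code-generation problem with
// passing two SSE vectors by value: data1 is zero-initialized and assigned afterwards instead
// of being constructed from the parameter directly.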
  20018. template <>
  20019. Vc_INTRINSIC SimdArray<double, 8>::SimdArray(fixed_size_simd<double, 4> &&x,
  20020. fixed_size_simd<double, 4> &&y)
  20021. : data0(x), data1(0)
  20022. {
  20023. data1 = y;
  20024. }
  20025. #endif
  20026. namespace Detail
  20027. {
  20028. #define Vc_FIXED_OP(op) \
  20029. template <class T, int N, \
  20030. class = typename std::enable_if<fixed_size_simd<T, N>::is_atomic>::type> \
  20031. fixed_size_simd<T, N> operator op(const fixed_size_simd<T, N> &a, \
  20032. const fixed_size_simd<T, N> &b) \
  20033. { \
  20034. return {private_init, internal_data(a) op internal_data(b)}; \
  20035. } \
  20036. template <class T, int N, \
  20037. class = typename std::enable_if<!fixed_size_simd<T, N>::is_atomic>::type, \
  20038. class = T> \
  20039. fixed_size_simd<T, N> operator op(const fixed_size_simd<T, N> &a, \
  20040. const fixed_size_simd<T, N> &b) \
  20041. { \
  20042. return {internal_data0(a) op internal_data0(b), \
  20043. internal_data1(a) op internal_data1(b)}; \
  20044. }
  20045. Vc_ALL_ARITHMETICS(Vc_FIXED_OP);
  20046. Vc_ALL_BINARY(Vc_FIXED_OP);
  20047. Vc_ALL_SHIFTS(Vc_FIXED_OP);
  20048. #undef Vc_FIXED_OP
  20049. #define Vc_FIXED_OP(op) \
  20050. template <class T, int N, \
  20051. class = typename std::enable_if<fixed_size_simd<T, N>::is_atomic>::type> \
  20052. fixed_size_simd_mask<T, N> operator op(const fixed_size_simd<T, N> &a, \
  20053. const fixed_size_simd<T, N> &b) \
  20054. { \
  20055. return {private_init, internal_data(a) op internal_data(b)}; \
  20056. } \
  20057. template <class T, int N, \
  20058. class = typename std::enable_if<!fixed_size_simd<T, N>::is_atomic>::type, \
  20059. class = T> \
  20060. fixed_size_simd_mask<T, N> operator op(const fixed_size_simd<T, N> &a, \
  20061. const fixed_size_simd<T, N> &b) \
  20062. { \
  20063. return {internal_data0(a) op internal_data0(b), \
  20064. internal_data1(a) op internal_data1(b)}; \
  20065. }
  20066. Vc_ALL_COMPARES(Vc_FIXED_OP);
  20067. #undef Vc_FIXED_OP
  20068. }
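// The Vc_FIXED_OP expansions above provide the arithmetic, bitwise, shift and comparison
// operators for fixed_size_simd: "atomic" arrays (backed by a single native vector) forward
// to the native operator, all other sizes recurse into their two storage halves.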
  20069. namespace result_vector_type_internal
  20070. {
  20071. template <typename T>
  20072. using remove_cvref = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
  20073. template <typename T>
  20074. using is_integer_larger_than_int = std::integral_constant<
  20075. bool, std::is_integral<T>::value &&(sizeof(T) > sizeof(int) ||
  20076. std::is_same<T, long>::value ||
  20077. std::is_same<T, unsigned long>::value)>;
  20078. template <
  20079. typename L, typename R,
  20080. std::size_t N = Traits::isSimdArray<L>::value ? Traits::simd_vector_size<L>::value
  20081. : Traits::simd_vector_size<R>::value,
  20082. bool = (Traits::isSimdArray<L>::value ||
  20083. Traits::isSimdArray<R>::value) &&
  20084. !(Traits::is_fixed_size_simd<L>::value &&
  20085. Traits::is_fixed_size_simd<R>::value) &&
  20086. ((std::is_arithmetic<remove_cvref<L>>::value &&
  20087. !is_integer_larger_than_int<remove_cvref<L>>::value) ||
  20088. (std::is_arithmetic<remove_cvref<R>>::value &&
  20089. !is_integer_larger_than_int<remove_cvref<R>>::value) ||
  20090. Traits::simd_vector_size<L>::value == Traits::simd_vector_size<R>::value)>
  20091. struct evaluate;
  20092. template <typename L, typename R, std::size_t N> struct evaluate<L, R, N, true>
  20093. {
  20094. private:
  20095. using LScalar = Traits::entry_type_of<L>;
  20096. using RScalar = Traits::entry_type_of<R>;
  20097. template <bool B, typename T, typename F>
  20098. using conditional = typename std::conditional<B, T, F>::type;
  20099. public:
  20100. using type = fixed_size_simd<
  20101. conditional<(std::is_integral<LScalar>::value &&std::is_integral<RScalar>::value &&
  20102. sizeof(LScalar) < sizeof(int) &&
  20103. sizeof(RScalar) < sizeof(int)),
  20104. conditional<(sizeof(LScalar) == sizeof(RScalar)),
  20105. conditional<std::is_unsigned<LScalar>::value, LScalar, RScalar>,
  20106. conditional<(sizeof(LScalar) > sizeof(RScalar)), LScalar, RScalar>>,
  20107. decltype(std::declval<LScalar>() + std::declval<RScalar>())>,
  20108. N>;
  20109. };
  20110. }
  20111. template <typename L, typename R>
  20112. using result_vector_type = typename result_vector_type_internal::evaluate<L, R>::type;
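// result_vector_type<L, R> encodes the promotion rules for mixed SimdArray/scalar operands:
// integral element types narrower than int promote as in built-in arithmetic (preferring the
// unsigned and the wider type), everything else uses the type of LScalar + RScalar.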
  20113. #define Vc_BINARY_OPERATORS_(op_) \
  20114. \
  20115. template <typename L, typename R> \
  20116. Vc_INTRINSIC result_vector_type<L, R> operator op_(L &&lhs, R &&rhs) \
  20117. { \
  20118. using Return = result_vector_type<L, R>; \
  20119. return Vc::Detail::operator op_( \
  20120. static_cast<const Return &>(std::forward<L>(lhs)), \
  20121. static_cast<const Return &>(std::forward<R>(rhs))); \
  20122. }
  20123. Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATORS_);
  20124. Vc_ALL_BINARY(Vc_BINARY_OPERATORS_);
  20125. #undef Vc_BINARY_OPERATORS_
  20126. #define Vc_BINARY_OPERATORS_(op_) \
  20127. \
  20128. template <typename L, typename R> \
  20129. Vc_INTRINSIC typename result_vector_type<L, R>::mask_type operator op_(L &&lhs, \
  20130. R &&rhs) \
  20131. { \
  20132. using Promote = result_vector_type<L, R>; \
  20133. return Promote(std::forward<L>(lhs)) op_ Promote(std::forward<R>(rhs)); \
  20134. }
  20135. Vc_ALL_COMPARES(Vc_BINARY_OPERATORS_);
  20136. #undef Vc_BINARY_OPERATORS_
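// The operator blocks above implement the mixed-operand overloads: both sides are converted
// to result_vector_type<L, R> first, then the homogeneous operators (or the comparisons
// returning a mask) apply. Sketch: for Vc::SimdArray<float, 4> v, the expression v * 2 is a
// fixed_size_simd<float, 4>.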
  20137. #define Vc_FORWARD_UNARY_OPERATOR(name_) \
  20138. \
  20139. template <typename T, std::size_t N, typename V, std::size_t M> \
  20140. inline fixed_size_simd<T, N> name_(const SimdArray<T, N, V, M> &x) \
  20141. { \
  20142. return fixed_size_simd<T, N>::fromOperation( \
  20143. Common::Operations::Forward_##name_(), x); \
  20144. } \
  20145. template <class T, int N> \
  20146. fixed_size_simd<T, N> name_(const fixed_size_simd<T, N> &x) \
  20147. { \
  20148. return fixed_size_simd<T, N>::fromOperation( \
  20149. Common::Operations::Forward_##name_(), x); \
  20150. } \
  20151. Vc_NOTHING_EXPECTING_SEMICOLON
  20152. #define Vc_FORWARD_UNARY_BOOL_OPERATOR(name_) \
  20153. \
  20154. template <typename T, std::size_t N, typename V, std::size_t M> \
  20155. inline fixed_size_simd_mask<T, N> name_(const SimdArray<T, N, V, M> &x) \
  20156. { \
  20157. return fixed_size_simd_mask<T, N>::fromOperation( \
  20158. Common::Operations::Forward_##name_(), x); \
  20159. } \
  20160. template <class T, int N> \
  20161. fixed_size_simd_mask<T, N> name_(const fixed_size_simd<T, N> &x) \
  20162. { \
  20163. return fixed_size_simd_mask<T, N>::fromOperation( \
  20164. Common::Operations::Forward_##name_(), x); \
  20165. } \
  20166. Vc_NOTHING_EXPECTING_SEMICOLON
  20167. #define Vc_FORWARD_BINARY_OPERATOR(name_) \
  20168. \
  20169. template <typename T, std::size_t N, typename V, std::size_t M> \
  20170. inline fixed_size_simd<T, N> name_(const SimdArray<T, N, V, M> &x, \
  20171. const SimdArray<T, N, V, M> &y) \
  20172. { \
  20173. return fixed_size_simd<T, N>::fromOperation( \
  20174. Common::Operations::Forward_##name_(), x, y); \
  20175. } \
  20176. Vc_NOTHING_EXPECTING_SEMICOLON
  20177. Vc_FORWARD_UNARY_OPERATOR(abs);
  20178. Vc_FORWARD_UNARY_OPERATOR(asin);
  20179. Vc_FORWARD_UNARY_OPERATOR(atan);
  20180. Vc_FORWARD_BINARY_OPERATOR(atan2);
  20181. Vc_FORWARD_UNARY_OPERATOR(ceil);
  20182. Vc_FORWARD_BINARY_OPERATOR(copysign);
  20183. Vc_FORWARD_UNARY_OPERATOR(cos);
  20184. Vc_FORWARD_UNARY_OPERATOR(exp);
  20185. Vc_FORWARD_UNARY_OPERATOR(exponent);
  20186. Vc_FORWARD_UNARY_OPERATOR(floor);
  20187. template <typename T, std::size_t N>
  20188. inline SimdArray<T, N> fma(const SimdArray<T, N> &a, const SimdArray<T, N> &b,
  20189. const SimdArray<T, N> &c)
  20190. {
  20191. return SimdArray<T, N>::fromOperation(Common::Operations::Forward_fma(), a, b, c);
  20192. }
  20193. Vc_FORWARD_UNARY_BOOL_OPERATOR(isfinite);
  20194. Vc_FORWARD_UNARY_BOOL_OPERATOR(isinf);
  20195. Vc_FORWARD_UNARY_BOOL_OPERATOR(isnan);
  20196. Vc_FORWARD_UNARY_BOOL_OPERATOR(isnegative);
  20197. template <typename T, std::size_t N>
  20198. inline SimdArray<T, N> frexp(const SimdArray<T, N> &x, SimdArray<int, N> *e)
  20199. {
  20200. return SimdArray<T, N>::fromOperation(Common::Operations::Forward_frexp(), x, e);
  20201. }
  20202. template <typename T, std::size_t N>
  20203. inline SimdArray<T, N> ldexp(const SimdArray<T, N> &x, const SimdArray<int, N> &e)
  20204. {
  20205. return SimdArray<T, N>::fromOperation(Common::Operations::Forward_ldexp(), x, e);
  20206. }
  20207. Vc_FORWARD_UNARY_OPERATOR(log);
  20208. Vc_FORWARD_UNARY_OPERATOR(log10);
  20209. Vc_FORWARD_UNARY_OPERATOR(log2);
  20210. Vc_FORWARD_UNARY_OPERATOR(reciprocal);
  20211. Vc_FORWARD_UNARY_OPERATOR(round);
  20212. Vc_FORWARD_UNARY_OPERATOR(rsqrt);
  20213. Vc_FORWARD_UNARY_OPERATOR(sin);
  20214. template <typename T, std::size_t N>
  20215. void sincos(const SimdArray<T, N> &x, SimdArray<T, N> *sin, SimdArray<T, N> *cos)
  20216. {
  20217. SimdArray<T, N>::callOperation(Common::Operations::Forward_sincos(), x, sin, cos);
  20218. }
  20219. Vc_FORWARD_UNARY_OPERATOR(sqrt);
  20220. Vc_FORWARD_UNARY_OPERATOR(trunc);
  20221. Vc_FORWARD_BINARY_OPERATOR(min);
  20222. Vc_FORWARD_BINARY_OPERATOR(max);
  20223. #undef Vc_FORWARD_UNARY_OPERATOR
  20224. #undef Vc_FORWARD_UNARY_BOOL_OPERATOR
  20225. #undef Vc_FORWARD_BINARY_OPERATOR
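// The forwarding wrappers above expose the elementwise math functions for SimdArray.
// Sketch (illustrative): auto y = Vc::sin(x) + Vc::abs(x);
//                        Vc::SimdArray<float, 8> s, c; Vc::sincos(x, &s, &c);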
  20226. #ifdef Vc_MSVC
  20227. #define Vc_DUMMY_ARG0 , int = 0
  20228. #define Vc_DUMMY_ARG1 , long = 0
  20229. #define Vc_DUMMY_ARG2 , short = 0
  20230. #define Vc_DUMMY_ARG3 , char = '0'
  20231. #define Vc_DUMMY_ARG4 , unsigned = 0u
  20232. #define Vc_DUMMY_ARG5 , unsigned short = 0u
  20233. #else
  20234. #define Vc_DUMMY_ARG0
  20235. #define Vc_DUMMY_ARG1
  20236. #define Vc_DUMMY_ARG2
  20237. #define Vc_DUMMY_ARG3
  20238. #define Vc_DUMMY_ARG4
  20239. #define Vc_DUMMY_ARG5
  20240. #endif
  20241. template <typename Return, std::size_t N, typename T, typename... From>
  20242. Vc_INTRINSIC Vc_CONST enable_if<sizeof...(From) != 0, Return>
  20243. simd_cast_impl_smaller_input(const From &... xs, const T &last)
  20244. {
  20245. Return r = simd_cast<Return>(xs...);
  20246. for (size_t i = 0; i < N; ++i) {
  20247. r[i + N * sizeof...(From)] = static_cast<typename Return::EntryType>(last[i]);
  20248. }
  20249. return r;
  20250. }
  20251. template <typename Return, std::size_t N, typename T>
  20252. Vc_INTRINSIC Vc_CONST Return simd_cast_impl_smaller_input(const T &last)
  20253. {
  20254. Return r = Return();
  20255. for (size_t i = 0; i < N; ++i) {
  20256. r[i] = static_cast<typename Return::EntryType>(last[i]);
  20257. }
  20258. return r;
  20259. }
  20260. template <typename Return, std::size_t N, typename T, typename... From>
  20261. Vc_INTRINSIC Vc_CONST enable_if<sizeof...(From) != 0, Return> simd_cast_impl_larger_input(
  20262. const From &... xs, const T &last)
  20263. {
  20264. Return r = simd_cast<Return>(xs...);
  20265. for (size_t i = N * sizeof...(From); i < Return::Size; ++i) {
  20266. r[i] = static_cast<typename Return::EntryType>(last[i - N * sizeof...(From)]);
  20267. }
  20268. return r;
  20269. }
  20270. template <typename Return, std::size_t N, typename T>
  20271. Vc_INTRINSIC Vc_CONST Return simd_cast_impl_larger_input(const T &last)
  20272. {
  20273. Return r = Return();
  20274. for (size_t i = 0; i < Return::size(); ++i) {
  20275. r[i] = static_cast<typename Return::EntryType>(last[i]);
  20276. }
  20277. return r;
  20278. }
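// The *_smaller_input/_larger_input helpers above are the scalar fallback for casts whose
// source chunk size does not divide the destination size: the leading arguments are cast as a
// block and the trailing argument is copied element by element.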
  20279. template <typename Return, typename T, typename... From>
  20280. Vc_INTRINSIC_L Vc_CONST_L Return
  20281. simd_cast_without_last(const From &... xs, const T &) Vc_INTRINSIC_R Vc_CONST_R;
  20282. template <typename... Ts> struct are_all_types_equal;
  20283. template <typename T>
  20284. struct are_all_types_equal<T> : public std::integral_constant<bool, true>
  20285. {
  20286. };
  20287. template <typename T0, typename T1, typename... Ts>
  20288. struct are_all_types_equal<T0, T1, Ts...>
  20289. : public std::integral_constant<
  20290. bool, std::is_same<T0, T1>::value && are_all_types_equal<T1, Ts...>::value>
  20291. {
  20292. };
  20293. template <typename Return, typename... Ts>
  20294. Vc_INTRINSIC Vc_CONST Return
  20295. simd_cast_interleaved_argument_order(const Ts &... a, const Ts &... b);
  20296. template <typename Return, std::size_t offset, typename From, typename... Froms>
  20297. Vc_INTRINSIC Vc_CONST
  20298. enable_if<(are_all_types_equal<From, Froms...>::value && offset == 0), Return>
  20299. simd_cast_with_offset(const From &x, const Froms &... xs);
  20300. template <typename Return, std::size_t offset, typename From>
  20301. Vc_INTRINSIC Vc_CONST
  20302. enable_if<(From::Size > offset && offset > 0 && offset % Return::Size == 0), Return>
  20303. simd_cast_with_offset(const From &x);
  20304. template <typename Return, std::size_t offset, typename From>
  20305. Vc_INTRINSIC Vc_CONST
  20306. enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 &&
  20307. ((Traits::isSimdArray<Return>::value &&
  20308. !Traits::isAtomicSimdArray<Return>::value) ||
  20309. (Traits::isSimdMaskArray<Return>::value &&
  20310. !Traits::isAtomicSimdMaskArray<Return>::value))),
  20311. Return>
  20312. simd_cast_with_offset(const From &x);
  20313. template <typename Return, std::size_t offset, typename From>
  20314. Vc_INTRINSIC Vc_CONST
  20315. enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 &&
  20316. ((Traits::isSimdArray<Return>::value &&
  20317. Traits::isAtomicSimdArray<Return>::value) ||
  20318. (Traits::isSimdMaskArray<Return>::value &&
  20319. Traits::isAtomicSimdMaskArray<Return>::value))),
  20320. Return>
  20321. simd_cast_with_offset(const From &x);
  20322. template <typename Return, std::size_t offset, typename From, typename... Froms>
  20323. Vc_INTRINSIC Vc_CONST enable_if<
  20324. (are_all_types_equal<From, Froms...>::value && From::Size <= offset), Return>
  20325. simd_cast_with_offset(const From &, const Froms &... xs)
  20326. {
  20327. return simd_cast_with_offset<Return, offset - From::Size>(xs...);
  20328. }
  20329. template <typename Return, std::size_t offset, typename From>
  20330. Vc_INTRINSIC Vc_CONST enable_if<(From::Size <= offset), Return> simd_cast_with_offset(
  20331. const From &)
  20332. {
  20333. return Return(0);
  20334. }
  20335. template <typename T, typename... Ts> struct first_type_of_impl
  20336. {
  20337. using type = T;
  20338. };
  20339. template <typename... Ts> using first_type_of = typename first_type_of_impl<Ts...>::type;
  20340. template <typename Return, typename From>
  20341. Vc_INTRINSIC Vc_CONST Return simd_cast_drop_arguments(From x);
  20342. template <typename Return, typename... Froms>
  20343. Vc_INTRINSIC Vc_CONST
  20344. enable_if<(are_all_types_equal<Froms...>::value &&
  20345. sizeof...(Froms) * first_type_of<Froms...>::Size < Return::Size),
  20346. Return>
  20347. simd_cast_drop_arguments(Froms... xs, first_type_of<Froms...> x);
  20348. template <typename Return, typename From, typename... Froms>
  20349. Vc_INTRINSIC Vc_CONST enable_if<
  20350. (are_all_types_equal<From, Froms...>::value &&
  20351. (1 + sizeof...(Froms)) * From::Size >= Return::Size && sizeof...(Froms) != 0),
  20352. Return>
  20353. simd_cast_drop_arguments(Froms... xs, From x, From);
  20354. template <typename Return, typename From>
  20355. Vc_INTRINSIC Vc_CONST
  20356. enable_if<(are_all_types_equal<From>::value && From::Size >= Return::Size), Return>
  20357. simd_cast_drop_arguments(From x, From);
  20358. namespace
  20359. {
  20360. #ifdef Vc_DEBUG_SIMD_CAST
  20361. void debugDoNothing(const std::initializer_list<void *> &) {}
  20362. template <typename T0, typename... Ts>
  20363. inline void vc_debug_(const char *prefix, const char *suffix, const T0 &arg0,
  20364. const Ts &... args)
  20365. {
  20366. std::cerr << prefix << arg0;
  20367. debugDoNothing({&(std::cerr << ", " << args)...});
  20368. std::cerr << suffix;
  20369. }
  20370. #else
  20371. template <typename T0, typename... Ts>
  20372. Vc_INTRINSIC void vc_debug_(const char *, const char *, const T0 &, const Ts &...)
  20373. {
  20374. }
  20375. #endif
  20376. }
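// vc_debug_ prints the arguments of every simd_cast overload when Vc_DEBUG_SIMD_CAST is
// defined; otherwise it compiles away to nothing.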
  20377. template <size_t A, size_t B>
  20378. struct is_less : public std::integral_constant<bool, (A < B)> {
  20379. };
  20380. template <size_t N>
  20381. struct is_power_of_2 : public std::integral_constant<bool, ((N - 1) & N) == 0> {
  20382. };
  20383. #define Vc_SIMDARRAY_CASTS(SimdArrayType_,NativeType_) \
  20384. template <typename Return, typename T, typename A, typename... Froms> \
  20385. Vc_INTRINSIC Vc_CONST enable_if< \
  20386. (Traits::isAtomic##SimdArrayType_<Return>::value && \
  20387. is_less<NativeType_<T, A>::Size * sizeof...(Froms), Return::Size>::value && \
  20388. are_all_types_equal<NativeType_<T, A>, Froms...>::value && \
  20389. !detail::is_fixed_size_abi<A>::value), \
  20390. Return> \
  20391. simd_cast(NativeType_<T, A> x, Froms... xs) \
  20392. { \
  20393. vc_debug_("simd_cast{1}(", ")\n", x, xs...); \
  20394. return {private_init, simd_cast<typename Return::storage_type>(x, xs...)}; \
  20395. } \
  20396. template <typename Return, typename T, typename A, typename... Froms> \
  20397. Vc_INTRINSIC Vc_CONST enable_if< \
  20398. (Traits::isAtomic##SimdArrayType_<Return>::value && \
  20399. !is_less<NativeType_<T, A>::Size * sizeof...(Froms), Return::Size>::value && \
  20400. are_all_types_equal<NativeType_<T, A>, Froms...>::value && \
  20401. !detail::is_fixed_size_abi<A>::value), \
  20402. Return> \
  20403. simd_cast(NativeType_<T, A> x, Froms... xs) \
  20404. { \
  20405. vc_debug_("simd_cast{2}(", ")\n", x, xs...); \
  20406. return {simd_cast_without_last<Return, NativeType_<T, A>, Froms...>(x, xs...)}; \
  20407. } \
  20408. template <typename Return, typename T, typename A, typename... Froms> \
  20409. Vc_INTRINSIC Vc_CONST \
  20410. enable_if<(Traits::is##SimdArrayType_<Return>::value && \
  20411. !Traits::isAtomic##SimdArrayType_<Return>::value && \
  20412. is_less<Common::left_size<Return::Size>(), \
  20413. NativeType_<T, A>::Size *(1 + sizeof...(Froms))>::value && \
  20414. are_all_types_equal<NativeType_<T, A>, Froms...>::value && \
  20415. !detail::is_fixed_size_abi<A>::value), \
  20416. Return> \
  20417. simd_cast(NativeType_<T, A> x, Froms... xs) \
  20418. { \
  20419. vc_debug_("simd_cast{3}(", ")\n", x, xs...); \
  20420. using R0 = typename Return::storage_type0; \
  20421. using R1 = typename Return::storage_type1; \
  20422. return {simd_cast_drop_arguments<R0, Froms...>(x, xs...), \
  20423. simd_cast_with_offset<R1, R0::Size>(x, xs...)}; \
  20424. } \
  20425. template <typename Return, typename T, typename A, typename... Froms> \
  20426. Vc_INTRINSIC Vc_CONST \
  20427. enable_if<(Traits::is##SimdArrayType_<Return>::value && \
  20428. !Traits::isAtomic##SimdArrayType_<Return>::value && \
  20429. !is_less<Common::left_size<Return::Size>(), \
  20430. NativeType_<T, A>::Size *(1 + sizeof...(Froms))>::value && \
  20431. are_all_types_equal<NativeType_<T, A>, Froms...>::value && \
  20432. !detail::is_fixed_size_abi<A>::value), \
  20433. Return> \
  20434. simd_cast(NativeType_<T, A> x, Froms... xs) \
  20435. { \
  20436. vc_debug_("simd_cast{4}(", ")\n", x, xs...); \
  20437. using R0 = typename Return::storage_type0; \
  20438. using R1 = typename Return::storage_type1; \
  20439. return {simd_cast<R0>(x, xs...), R1(0)}; \
  20440. } \
  20441. Vc_NOTHING_EXPECTING_SEMICOLON
  20442. Vc_SIMDARRAY_CASTS(SimdArray, Vc::Vector);
  20443. Vc_SIMDARRAY_CASTS(SimdMaskArray, Vc::Mask);
  20444. #undef Vc_SIMDARRAY_CASTS
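// The casts generated above convert native Vector/Mask arguments into (Simd)Array targets.
// Four cases are distinguished: an atomic target that still has room for more input, an
// atomic target that must drop surplus arguments, a bisected target whose left half consumes
// only part of the input, and a bisected target whose right half ends up zero-filled.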
  20445. #define Vc_SIMDARRAY_CASTS(SimdArrayType_,NativeType_) \
  20446. \
  20447. template <typename Return, int offset, typename T, typename A> \
  20448. Vc_INTRINSIC Vc_CONST \
  20449. enable_if<Traits::isAtomic##SimdArrayType_<Return>::value, Return> \
  20450. simd_cast(NativeType_<T, A> x Vc_DUMMY_ARG0) \
  20451. { \
  20452. vc_debug_("simd_cast{offset, atomic}(", ")\n", offset, x); \
  20453. return {private_init, simd_cast<typename Return::storage_type, offset>(x)}; \
  20454. } \
  20455. \
  20456. template <typename Return, int offset, typename T, typename A> \
  20457. Vc_INTRINSIC Vc_CONST \
  20458. enable_if<(Traits::is##SimdArrayType_<Return>::value && \
  20459. !Traits::isAtomic##SimdArrayType_<Return>::value && \
  20460. Return::Size * offset + Common::left_size<Return::Size>() < \
  20461. NativeType_<T, A>::Size), \
  20462. Return> \
  20463. simd_cast(NativeType_<T, A> x Vc_DUMMY_ARG1) \
  20464. { \
  20465. vc_debug_("simd_cast{offset, split Return}(", ")\n", offset, x); \
  20466. using R0 = typename Return::storage_type0; \
  20467. constexpr int entries_offset = offset * Return::Size; \
  20468. constexpr int entries_offset_right = entries_offset + R0::Size; \
  20469. return { \
  20470. simd_cast_with_offset<typename Return::storage_type0, entries_offset>(x), \
  20471. simd_cast_with_offset<typename Return::storage_type1, entries_offset_right>( \
  20472. x)}; \
  20473. } \
  20474. \
  20475. \
  20476. template <typename Return, int offset, typename T, typename A> \
  20477. Vc_INTRINSIC Vc_CONST \
  20478. enable_if<(Traits::is##SimdArrayType_<Return>::value && \
  20479. !Traits::isAtomic##SimdArrayType_<Return>::value && \
  20480. Return::Size * offset + Common::left_size<Return::Size>() >= \
  20481. NativeType_<T, A>::Size), \
  20482. Return> \
  20483. simd_cast(NativeType_<T, A> x Vc_DUMMY_ARG2) \
  20484. { \
  20485. vc_debug_("simd_cast{offset, R1::Zero}(", ")\n", offset, x); \
  20486. using R0 = typename Return::storage_type0; \
  20487. using R1 = typename Return::storage_type1; \
  20488. constexpr int entries_offset = offset * Return::Size; \
  20489. return {simd_cast_with_offset<R0, entries_offset>(x), R1(0)}; \
  20490. } \
  20491. Vc_NOTHING_EXPECTING_SEMICOLON
  20492. Vc_SIMDARRAY_CASTS(SimdArray, Vc::Vector);
  20493. Vc_SIMDARRAY_CASTS(SimdMaskArray, Vc::Mask);
  20494. #undef Vc_SIMDARRAY_CASTS
  20495. #define Vc_SIMDARRAY_CASTS(SimdArrayType_) \
  20496. \
  20497. template <typename Return, typename T, std::size_t N, typename V, typename... From> \
  20498. Vc_INTRINSIC Vc_CONST \
  20499. enable_if<(are_all_types_equal<SimdArrayType_<T, N, V, N>, From...>::value && \
  20500. (sizeof...(From) == 0 || N * sizeof...(From) < Return::Size) && \
  20501. !std::is_same<Return, SimdArrayType_<T, N, V, N>>::value), \
  20502. Return> \
  20503. simd_cast(const SimdArrayType_<T, N, V, N> &x0, const From &... xs) \
  20504. { \
  20505. vc_debug_("simd_cast{indivisible}(", ")\n", x0, xs...); \
  20506. return simd_cast<Return>(internal_data(x0), internal_data(xs)...); \
  20507. } \
  20508. \
  20509. template <typename Return, typename T, std::size_t N, typename V, typename... From> \
  20510. Vc_INTRINSIC Vc_CONST \
  20511. enable_if<(are_all_types_equal<SimdArrayType_<T, N, V, N>, From...>::value && \
  20512. (sizeof...(From) > 0 && (N * sizeof...(From) >= Return::Size)) && \
  20513. !std::is_same<Return, SimdArrayType_<T, N, V, N>>::value), \
  20514. Return> \
  20515. simd_cast(const SimdArrayType_<T, N, V, N> &x0, const From &... xs) \
  20516. { \
  20517. vc_debug_("simd_cast{indivisible2}(", ")\n", x0, xs...); \
  20518. return simd_cast_without_last<Return, \
  20519. typename SimdArrayType_<T, N, V, N>::storage_type, \
  20520. typename From::storage_type...>( \
  20521. internal_data(x0), internal_data(xs)...); \
  20522. } \
  20523. \
  20524. template <typename Return, typename T, std::size_t N, typename V, std::size_t M, \
  20525. typename... From> \
  20526. Vc_INTRINSIC Vc_CONST enable_if< \
  20527. (N != M && are_all_types_equal<SimdArrayType_<T, N, V, M>, From...>::value && \
  20528. !std::is_same<Return, SimdArrayType_<T, N, V, M>>::value && \
  20529. is_less<N * sizeof...(From), Return::Size>::value && is_power_of_2<N>::value), \
  20530. Return> \
  20531. simd_cast(const SimdArrayType_<T, N, V, M> &x0, const From &... xs) \
  20532. { \
  20533. vc_debug_("simd_cast{bisectable}(", ")\n", x0, xs...); \
  20534. return simd_cast_interleaved_argument_order< \
  20535. Return, typename SimdArrayType_<T, N, V, M>::storage_type0, \
  20536. typename From::storage_type0...>(internal_data0(x0), internal_data0(xs)..., \
  20537. internal_data1(x0), internal_data1(xs)...); \
  20538. } \
  20539. \
  20540. template <typename Return, typename T, std::size_t N, typename V, std::size_t M, \
  20541. typename... From> \
  20542. Vc_INTRINSIC Vc_CONST enable_if< \
  20543. (N != M && are_all_types_equal<SimdArrayType_<T, N, V, M>, From...>::value && \
  20544. !is_less<N * sizeof...(From), Return::Size>::value && is_power_of_2<N>::value), \
  20545. Return> \
  20546. simd_cast(const SimdArrayType_<T, N, V, M> &x0, const From &... xs) \
  20547. { \
  20548. vc_debug_("simd_cast{bisectable2}(", ")\n", x0, xs...); \
  20549. return simd_cast_without_last<Return, SimdArrayType_<T, N, V, M>, From...>( \
  20550. x0, xs...); \
  20551. } \
  20552. \
  20553. template <typename Return, typename T, std::size_t N, typename V, std::size_t M, \
  20554. typename... From> \
  20555. Vc_INTRINSIC Vc_CONST enable_if< \
  20556. (N != M && are_all_types_equal<SimdArrayType_<T, N, V, M>, From...>::value && \
  20557. N * (1 + sizeof...(From)) <= Return::Size && !is_power_of_2<N>::value), \
  20558. Return> \
  20559. simd_cast(const SimdArrayType_<T, N, V, M> &x0, const From &... xs) \
  20560. { \
  20561. vc_debug_("simd_cast{remaining}(", ")\n", x0, xs...); \
  20562. return simd_cast_impl_smaller_input<Return, N, SimdArrayType_<T, N, V, M>, \
  20563. From...>(x0, xs...); \
  20564. } \
  20565. \
  20566. template <typename Return, typename T, std::size_t N, typename V, std::size_t M, \
  20567. typename... From> \
  20568. Vc_INTRINSIC Vc_CONST enable_if< \
  20569. (N != M && are_all_types_equal<SimdArrayType_<T, N, V, M>, From...>::value && \
  20570. N * (1 + sizeof...(From)) > Return::Size && !is_power_of_2<N>::value), \
  20571. Return> \
  20572. simd_cast(const SimdArrayType_<T, N, V, M> &x0, const From &... xs) \
  20573. { \
  20574. vc_debug_("simd_cast{remaining2}(", ")\n", x0, xs...); \
  20575. return simd_cast_impl_larger_input<Return, N, SimdArrayType_<T, N, V, M>, \
  20576. From...>(x0, xs...); \
  20577. } \
  20578. \
  20579. template <typename Return, typename T, std::size_t N, typename V, std::size_t M> \
  20580. Vc_INTRINSIC Vc_CONST \
  20581. enable_if<(N != M && N >= 2 * Return::Size && is_power_of_2<N>::value), Return> \
  20582. simd_cast(const SimdArrayType_<T, N, V, M> &x) \
  20583. { \
  20584. vc_debug_("simd_cast{single bisectable}(", ")\n", x); \
  20585. return simd_cast<Return>(internal_data0(x)); \
  20586. } \
  20587. template <typename Return, typename T, std::size_t N, typename V, std::size_t M> \
  20588. Vc_INTRINSIC Vc_CONST enable_if<(N != M && N > Return::Size && \
  20589. N < 2 * Return::Size && is_power_of_2<N>::value), \
  20590. Return> \
  20591. simd_cast(const SimdArrayType_<T, N, V, M> &x) \
  20592. { \
  20593. vc_debug_("simd_cast{single bisectable2}(", ")\n", x); \
  20594. return simd_cast<Return>(internal_data0(x), internal_data1(x)); \
  20595. } \
  20596. Vc_NOTHING_EXPECTING_SEMICOLON
  20597. Vc_SIMDARRAY_CASTS(SimdArray);
  20598. Vc_SIMDARRAY_CASTS(SimdMaskArray);
  20599. #undef Vc_SIMDARRAY_CASTS
  20600. template <class Return, class T, int N, class... Ts,
  20601. class = enable_if<!std::is_same<Return, fixed_size_simd<T, N>>::value>>
  20602. Vc_INTRINSIC Return simd_cast(const fixed_size_simd<T, N> &x, const Ts &... xs)
  20603. {
  20604. return simd_cast<Return>(static_cast<const SimdArray<T, N> &>(x),
  20605. static_cast<const SimdArray<T, N> &>(xs)...);
  20606. }
  20607. template <class Return, class T, int N, class... Ts,
  20608. class = enable_if<!std::is_same<Return, fixed_size_simd_mask<T, N>>::value>>
  20609. Vc_INTRINSIC Return simd_cast(const fixed_size_simd_mask<T, N> &x, const Ts &... xs)
  20610. {
  20611. return simd_cast<Return>(static_cast<const SimdMaskArray<T, N> &>(x),
  20612. static_cast<const SimdMaskArray<T, N> &>(xs)...);
  20613. }
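// simd_cast usage sketch (illustrative; widths and types invented):
//   Vc::SimdArray<float, 8> f = ...;
//   auto i  = Vc::simd_cast<Vc::SimdArray<int, 8>>(f);       // convert the element type
//   auto lo = Vc::simd_cast<Vc::SimdArray<float, 4>, 0>(f);  // elements 0..3
//   auto hi = Vc::simd_cast<Vc::SimdArray<float, 4>, 1>(f);  // elements 4..7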
  20614. #define Vc_SIMDARRAY_CASTS(SimdArrayType_) \
  20615. \
  20616. template <typename Return, int offset, typename T, std::size_t N, typename V, \
  20617. std::size_t M> \
  20618. Vc_INTRINSIC Vc_CONST enable_if<(offset == 0), Return> simd_cast( \
  20619. const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG0) \
  20620. { \
  20621. vc_debug_("simd_cast{offset == 0}(", ")\n", offset, x); \
  20622. return simd_cast<Return>(x); \
  20623. } \
  20624. \
  20625. template <typename Return, int offset, typename T, std::size_t N, typename V> \
  20626. Vc_INTRINSIC Vc_CONST enable_if<(offset != 0), Return> simd_cast( \
  20627. const SimdArrayType_<T, N, V, N> &x Vc_DUMMY_ARG1) \
  20628. { \
  20629. vc_debug_("simd_cast{offset, forward}(", ")\n", offset, x); \
  20630. return simd_cast<Return, offset>(internal_data(x)); \
  20631. } \
  20632. \
  20633. template <typename Return, int offset, typename T, std::size_t N, typename V, \
  20634. std::size_t M> \
  20635. Vc_INTRINSIC Vc_CONST \
  20636. enable_if<(N != M && offset * Return::Size >= Common::left_size<N>() && \
  20637. offset != 0 && Common::left_size<N>() % Return::Size == 0), \
  20638. Return> \
  20639. simd_cast(const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG2) \
  20640. { \
  20641. vc_debug_("simd_cast{offset, right}(", ")\n", offset, x); \
  20642. return simd_cast<Return, offset - Common::left_size<N>() / Return::Size>( \
  20643. internal_data1(x)); \
  20644. } \
  20645. \
  20646. template <typename Return, int offset, typename T, std::size_t N, typename V, \
  20647. std::size_t M> \
  20648. Vc_INTRINSIC Vc_CONST \
  20649. enable_if<(N != M && offset * Return::Size >= Common::left_size<N>() && \
  20650. offset != 0 && Common::left_size<N>() % Return::Size != 0), \
  20651. Return> \
  20652. simd_cast(const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG3) \
  20653. { \
  20654. vc_debug_("simd_cast{offset, right, nofit}(", ")\n", offset, x); \
  20655. return simd_cast_with_offset<Return, \
  20656. offset * Return::Size - Common::left_size<N>()>( \
  20657. internal_data1(x)); \
  20658. } \
  20659. \
  20660. template <typename Return, int offset, typename T, std::size_t N, typename V, \
  20661. std::size_t M> \
  20662. Vc_INTRINSIC Vc_CONST enable_if< \
  20663. (N != M && \
  20664. offset != 0 && (offset + 1) * Return::Size <= Common::left_size<N>()), \
  20665. Return> \
  20666. simd_cast(const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG4) \
  20667. { \
  20668. vc_debug_("simd_cast{offset, left}(", ")\n", offset, x); \
  20669. return simd_cast<Return, offset>(internal_data0(x)); \
  20670. } \
  20671. \
  20672. template <typename Return, int offset, typename T, std::size_t N, typename V, \
  20673. std::size_t M> \
  20674. Vc_INTRINSIC Vc_CONST \
  20675. enable_if<(N != M && (offset * Return::Size < Common::left_size<N>()) && \
  20676. offset != 0 && (offset + 1) * Return::Size > Common::left_size<N>()), \
  20677. Return> \
  20678. simd_cast(const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG5) \
  20679. { \
  20680. vc_debug_("simd_cast{offset, copy scalars}(", ")\n", offset, x); \
  20681. using R = typename Return::EntryType; \
  20682. Return r = Return(0); \
  20683. for (std::size_t i = offset * Return::Size; \
  20684. i < std::min(N, (offset + 1) * Return::Size); ++i) { \
  20685. r[i - offset * Return::Size] = static_cast<R>(x[i]); \
  20686. } \
  20687. return r; \
  20688. } \
  20689. Vc_NOTHING_EXPECTING_SEMICOLON
  20690. Vc_SIMDARRAY_CASTS(SimdArray);
  20691. Vc_SIMDARRAY_CASTS(SimdMaskArray);
  20692. #undef Vc_SIMDARRAY_CASTS
  20693. template <typename Return, typename From>
  20694. Vc_INTRINSIC Vc_CONST Return simd_cast_drop_arguments(From x)
  20695. {
  20696. return simd_cast<Return>(x);
  20697. }
  20698. template <typename Return, typename... Froms>
  20699. Vc_INTRINSIC Vc_CONST
  20700. enable_if<(are_all_types_equal<Froms...>::value &&
  20701. sizeof...(Froms) * first_type_of<Froms...>::Size < Return::Size),
  20702. Return>
  20703. simd_cast_drop_arguments(Froms... xs, first_type_of<Froms...> x)
  20704. {
  20705. return simd_cast<Return>(xs..., x);
  20706. }
  20707. template <typename Return, typename From, typename... Froms>
  20708. Vc_INTRINSIC Vc_CONST enable_if<
  20709. (are_all_types_equal<From, Froms...>::value &&
  20710. (1 + sizeof...(Froms)) * From::Size >= Return::Size && sizeof...(Froms) != 0),
  20711. Return>
  20712. simd_cast_drop_arguments(Froms... xs, From x, From)
  20713. {
  20714. return simd_cast_drop_arguments<Return, Froms...>(xs..., x);
  20715. }
  20716. template <typename Return, typename From>
  20717. Vc_INTRINSIC Vc_CONST
  20718. enable_if<(are_all_types_equal<From>::value && From::Size >= Return::Size), Return>
  20719. simd_cast_drop_arguments(From x, From)
  20720. {
  20721. return simd_cast_drop_arguments<Return>(x);
  20722. }
  20723. template <typename Return, std::size_t offset, typename From>
  20724. Vc_INTRINSIC Vc_CONST
  20725. enable_if<(From::Size > offset && offset > 0 && offset % Return::Size == 0),
  20726. Return> simd_cast_with_offset(const From &x)
  20727. {
  20728. return simd_cast<Return, offset / Return::Size>(x);
  20729. }
  20730. template <typename Return, std::size_t offset, typename From>
  20731. Vc_INTRINSIC Vc_CONST
  20732. enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 &&
  20733. ((Traits::isSimdArray<Return>::value &&
  20734. !Traits::isAtomicSimdArray<Return>::value) ||
  20735. (Traits::isSimdMaskArray<Return>::value &&
  20736. !Traits::isAtomicSimdMaskArray<Return>::value))),
  20737. Return>
  20738. simd_cast_with_offset(const From &x)
  20739. {
  20740. using R0 = typename Return::storage_type0;
  20741. using R1 = typename Return::storage_type1;
  20742. return {simd_cast_with_offset<R0, offset>(x),
  20743. simd_cast_with_offset<R1, offset + R0::Size>(x)};
  20744. }
  20745. template <typename Return, std::size_t offset, typename From>
  20746. Vc_INTRINSIC Vc_CONST
  20747. enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 &&
  20748. ((Traits::isSimdArray<Return>::value &&
  20749. Traits::isAtomicSimdArray<Return>::value) ||
  20750. (Traits::isSimdMaskArray<Return>::value &&
  20751. Traits::isAtomicSimdMaskArray<Return>::value))),
  20752. Return>
  20753. simd_cast_with_offset(const From &x)
  20754. {
  20755. return simd_cast<Return, offset / Return::Size>(x.shifted(offset % Return::Size));
  20756. }
  20757. template <typename Return, std::size_t offset, typename From, typename... Froms>
  20758. Vc_INTRINSIC Vc_CONST
  20759. enable_if<(are_all_types_equal<From, Froms...>::value && offset == 0), Return>
  20760. simd_cast_with_offset(const From &x, const Froms &... xs)
  20761. {
  20762. return simd_cast<Return>(x, xs...);
  20763. }
  20764. template <typename Return, typename T, typename... From>
  20765. Vc_INTRINSIC Vc_CONST Return simd_cast_without_last(const From &... xs, const T &)
  20766. {
  20767. return simd_cast<Return>(xs...);
  20768. }
  20769. #ifdef Vc_MSVC
  20770. template <std::size_t I, typename T0>
  20771. Vc_INTRINSIC Vc_CONST enable_if<(I == 0), T0> extract_interleaved(const T0 &a0, const T0 &)
  20772. {
  20773. return a0;
  20774. }
  20775. template <std::size_t I, typename T0>
  20776. Vc_INTRINSIC Vc_CONST enable_if<(I == 1), T0> extract_interleaved(const T0 &, const T0 &b0)
  20777. {
  20778. return b0;
  20779. }
  20780. #endif
  20781. template <std::size_t I, typename T0, typename... Ts>
  20782. Vc_INTRINSIC Vc_CONST enable_if<(I == 0), T0> extract_interleaved(const T0 &a0,
  20783. const Ts &...,
  20784. const T0 &,
  20785. const Ts &...)
  20786. {
  20787. return a0;
  20788. }
  20789. template <std::size_t I, typename T0, typename... Ts>
  20790. Vc_INTRINSIC Vc_CONST enable_if<(I == 1), T0> extract_interleaved(const T0 &,
  20791. const Ts &...,
  20792. const T0 &b0,
  20793. const Ts &...)
  20794. {
  20795. return b0;
  20796. }
  20797. template <std::size_t I, typename T0, typename... Ts>
  20798. Vc_INTRINSIC Vc_CONST enable_if<(I > 1), T0> extract_interleaved(const T0 &,
  20799. const Ts &... a,
  20800. const T0 &,
  20801. const Ts &... b)
  20802. {
  20803. return extract_interleaved<I - 2, Ts...>(a..., b...);
  20804. }
  20805. template <typename Return, typename... Ts, std::size_t... Indexes>
  20806. Vc_INTRINSIC Vc_CONST Return
  20807. simd_cast_interleaved_argument_order_1(index_sequence<Indexes...>, const Ts &... a,
  20808. const Ts &... b)
  20809. {
  20810. return simd_cast<Return>(extract_interleaved<Indexes, Ts...>(a..., b...)...);
  20811. }
  20812. template <typename Return, typename... Ts>
  20813. Vc_INTRINSIC Vc_CONST Return
  20814. simd_cast_interleaved_argument_order(const Ts &... a, const Ts &... b)
  20815. {
  20816. using seq = make_index_sequence<sizeof...(Ts)*2>;
  20817. return simd_cast_interleaved_argument_order_1<Return, Ts...>(seq(), a..., b...);
  20818. }
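// simd_cast_interleaved_argument_order reorders the packs (a0..an, b0..bn) into
// (a0, b0, a1, b1, ...) before casting, so that the low and high storage halves of each
// bisectable SimdArray argument end up adjacent, matching their element order.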
  20819. #define Vc_CONDITIONAL_ASSIGN(name_,op_) \
  20820. template <Operator O, typename T, std::size_t N, typename V, size_t VN, typename M, \
  20821. typename U> \
  20822. Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
  20823. SimdArray<T, N, V, VN> &lhs, M &&mask, U &&rhs) \
  20824. { \
  20825. lhs(mask) op_ rhs; \
  20826. } \
  20827. Vc_NOTHING_EXPECTING_SEMICOLON
  20828. Vc_CONDITIONAL_ASSIGN( Assign, =);
  20829. Vc_CONDITIONAL_ASSIGN( PlusAssign, +=);
  20830. Vc_CONDITIONAL_ASSIGN( MinusAssign, -=);
  20831. Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
  20832. Vc_CONDITIONAL_ASSIGN( DivideAssign, /=);
  20833. Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
  20834. Vc_CONDITIONAL_ASSIGN( XorAssign, ^=);
  20835. Vc_CONDITIONAL_ASSIGN( AndAssign, &=);
  20836. Vc_CONDITIONAL_ASSIGN( OrAssign, |=);
  20837. Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
  20838. Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
  20839. #undef Vc_CONDITIONAL_ASSIGN
  20840. #define Vc_CONDITIONAL_ASSIGN(name_,expr_) \
  20841. template <Operator O, typename T, std::size_t N, typename V, size_t VN, typename M> \
  20842. Vc_INTRINSIC enable_if<O == Operator::name_, SimdArray<T, N, V, VN>> \
  20843. conditional_assign(SimdArray<T, N, V, VN> &lhs, M &&mask) \
  20844. { \
  20845. return expr_; \
  20846. } \
  20847. Vc_NOTHING_EXPECTING_SEMICOLON
  20848. Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
  20849. Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
  20850. Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
  20851. Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
  20852. #undef Vc_CONDITIONAL_ASSIGN
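// conditional_assign() is the hook behind masked compound assignment and increment/decrement
// for SimdArray, e.g. (sketch) Vc::where(v > 0, v) -= 1; forwards to the MinusAssign case.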
  20853. namespace Common
  20854. {
  20855. template <typename T, size_t N, typename V>
  20856. inline void transpose_impl(
  20857. TransposeTag<4, 4>, SimdArray<T, N, V, N> *Vc_RESTRICT r[],
  20858. const TransposeProxy<SimdArray<T, N, V, N>, SimdArray<T, N, V, N>,
  20859. SimdArray<T, N, V, N>, SimdArray<T, N, V, N>> &proxy)
  20860. {
  20861. V *Vc_RESTRICT r2[4] = {&internal_data(*r[0]), &internal_data(*r[1]),
  20862. &internal_data(*r[2]), &internal_data(*r[3])};
  20863. transpose_impl(TransposeTag<4, 4>(), &r2[0],
  20864. TransposeProxy<V, V, V, V>{internal_data(std::get<0>(proxy.in)),
  20865. internal_data(std::get<1>(proxy.in)),
  20866. internal_data(std::get<2>(proxy.in)),
  20867. internal_data(std::get<3>(proxy.in))});
  20868. }
  20869. template <typename T, typename V>
  20870. inline void transpose_impl(
  20871. TransposeTag<2, 4>, SimdArray<T, 4, V, 1> *Vc_RESTRICT r[],
  20872. const TransposeProxy<SimdArray<T, 2, V, 1>, SimdArray<T, 2, V, 1>,
  20873. SimdArray<T, 2, V, 1>, SimdArray<T, 2, V, 1>> &proxy)
  20874. {
  20875. auto &lo = *r[0];
  20876. auto &hi = *r[1];
  20877. internal_data0(internal_data0(lo)) = internal_data0(std::get<0>(proxy.in));
  20878. internal_data1(internal_data0(lo)) = internal_data0(std::get<1>(proxy.in));
  20879. internal_data0(internal_data1(lo)) = internal_data0(std::get<2>(proxy.in));
  20880. internal_data1(internal_data1(lo)) = internal_data0(std::get<3>(proxy.in));
  20881. internal_data0(internal_data0(hi)) = internal_data1(std::get<0>(proxy.in));
  20882. internal_data1(internal_data0(hi)) = internal_data1(std::get<1>(proxy.in));
  20883. internal_data0(internal_data1(hi)) = internal_data1(std::get<2>(proxy.in));
  20884. internal_data1(internal_data1(hi)) = internal_data1(std::get<3>(proxy.in));
  20885. }
  20886. template <typename T, typename V>
  20887. inline void transpose_impl(
  20888. TransposeTag<4, 4>, SimdArray<T, 1, V, 1> *Vc_RESTRICT r[],
  20889. const TransposeProxy<SimdArray<T, 1, V, 1>, SimdArray<T, 1, V, 1>,
  20890. SimdArray<T, 1, V, 1>, SimdArray<T, 1, V, 1>> &proxy)
  20891. {
  20892. V *Vc_RESTRICT r2[4] = {&internal_data(*r[0]), &internal_data(*r[1]),
  20893. &internal_data(*r[2]), &internal_data(*r[3])};
  20894. transpose_impl(TransposeTag<4, 4>(), &r2[0],
  20895. TransposeProxy<V, V, V, V>{internal_data(std::get<0>(proxy.in)),
  20896. internal_data(std::get<1>(proxy.in)),
  20897. internal_data(std::get<2>(proxy.in)),
  20898. internal_data(std::get<3>(proxy.in))});
  20899. }
  20900. template <typename T, size_t N, typename V>
  20901. inline void transpose_impl(
  20902. TransposeTag<4, 4>, SimdArray<T, N, V, 1> *Vc_RESTRICT r[],
  20903. const TransposeProxy<SimdArray<T, N, V, 1>, SimdArray<T, N, V, 1>,
  20904. SimdArray<T, N, V, 1>, SimdArray<T, N, V, 1>> &proxy)
  20905. {
  20906. SimdArray<T, N, V, 1> *Vc_RESTRICT r0[4 / 2] = {r[0], r[1]};
  20907. SimdArray<T, N, V, 1> *Vc_RESTRICT r1[4 / 2] = {r[2], r[3]};
  20908. using H = SimdArray<T, 2>;
  20909. transpose_impl(TransposeTag<2, 4>(), &r0[0],
  20910. TransposeProxy<H, H, H, H>{internal_data0(std::get<0>(proxy.in)),
  20911. internal_data0(std::get<1>(proxy.in)),
  20912. internal_data0(std::get<2>(proxy.in)),
  20913. internal_data0(std::get<3>(proxy.in))});
  20914. transpose_impl(TransposeTag<2, 4>(), &r1[0],
  20915. TransposeProxy<H, H, H, H>{internal_data1(std::get<0>(proxy.in)),
  20916. internal_data1(std::get<1>(proxy.in)),
  20917. internal_data1(std::get<2>(proxy.in)),
  20918. internal_data1(std::get<3>(proxy.in))});
  20919. }
  20920. }
  20921. namespace Detail
  20922. {
  20923. template <class T, size_t N, class V, size_t VSizeof>
  20924. struct InterleaveImpl<SimdArray<T, N, V, N>, N, VSizeof> {
  20925. template <class I, class... VV>
  20926. static Vc_INTRINSIC void interleave(T *const data, const I &i, const VV &... vv)
  20927. {
  20928. InterleaveImpl<V, N, VSizeof>::interleave(data, i, internal_data(vv)...);
  20929. }
  20930. template <class I, class... VV>
  20931. static Vc_INTRINSIC void deinterleave(T const *const data, const I &i, VV &... vv)
  20932. {
  20933. InterleaveImpl<V, N, VSizeof>::deinterleave(data, i, internal_data(vv)...);
  20934. }
  20935. };
  20936. }
  20937. }
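// std::numeric_limits for SimdArray: inherits the scalar traits of T and
// broadcasts the scalar limit values into full SimdArray objects.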
  20938. namespace std
  20939. {
  20940. template <typename T, size_t N, typename V, size_t VN>
  20941. struct numeric_limits<Vc::SimdArray<T, N, V, VN>> : public numeric_limits<T> {
  20942. private:
  20943. using R = Vc::SimdArray<T, N, V, VN>;
  20944. public:
  20945. static Vc_ALWAYS_INLINE Vc_CONST R max() noexcept { return numeric_limits<T>::max(); }
  20946. static Vc_ALWAYS_INLINE Vc_CONST R min() noexcept { return numeric_limits<T>::min(); }
  20947. static Vc_ALWAYS_INLINE Vc_CONST R lowest() noexcept
  20948. {
  20949. return numeric_limits<T>::lowest();
  20950. }
  20951. static Vc_ALWAYS_INLINE Vc_CONST R epsilon() noexcept
  20952. {
  20953. return numeric_limits<T>::epsilon();
  20954. }
  20955. static Vc_ALWAYS_INLINE Vc_CONST R round_error() noexcept
  20956. {
  20957. return numeric_limits<T>::round_error();
  20958. }
  20959. static Vc_ALWAYS_INLINE Vc_CONST R infinity() noexcept
  20960. {
  20961. return numeric_limits<T>::infinity();
  20962. }
  20963. static Vc_ALWAYS_INLINE Vc_CONST R quiet_NaN() noexcept
  20964. {
  20965. return numeric_limits<T>::quiet_NaN();
  20966. }
  20967. static Vc_ALWAYS_INLINE Vc_CONST R signaling_NaN() noexcept
  20968. {
  20969. return numeric_limits<T>::signaling_NaN();
  20970. }
  20971. static Vc_ALWAYS_INLINE Vc_CONST R denorm_min() noexcept
  20972. {
  20973. return numeric_limits<T>::denorm_min();
  20974. }
  20975. };
  20976. }
  20977. #endif
  20978. namespace Vc_VERSIONED_NAMESPACE
  20979. {
  20980. namespace Detail
  20981. {
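// Return-type deduction for mixed-type operators on Vector<T, Abi>:
// FundamentalReturnType encodes the arithmetic promotion rules between the two
// entry types, and ReturnTypeImpl maps the result back onto a Vector type.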
  20982. template <typename T, typename Abi, typename U>
  20983. enable_if<!std::is_same<T, U>::value, U> is_convertible_to_any_vector(Vector<U, Abi>);
  20984. template <typename T, typename Abi> T is_convertible_to_any_vector(Vector<T, Abi>);
  20985. template <typename T, typename U, bool = std::is_integral<T>::value,
  20986. bool = std::is_integral<U>::value>
  20987. struct FundamentalReturnType;
  20988. template <class T, class U>
  20989. using fundamental_return_t = typename FundamentalReturnType<T, U>::type;
  20990. template <typename T, typename U> struct FundamentalReturnType<T, U, false, false> {
  20991. using type = typename std::conditional<
  20992. std::is_arithmetic<U>::value,
  20993. typename std::conditional<(sizeof(T) < sizeof(U)), U, T>::type,
  20994. T>::type;
  20995. };
  20996. template <typename T, typename U> struct FundamentalReturnType<T, U, true, false> {
  20997. using type = typename std::conditional<
  20998. std::is_arithmetic<U>::value, U,
  20999. T>::type;
  21000. };
  21001. template <typename T, typename U> struct FundamentalReturnType<T, U, false, true> {
  21002. using type = T;
  21003. };
  21004. template <typename T> struct my_make_signed : public std::make_signed<T> {
  21005. };
  21006. template <> struct my_make_signed<bool> {
  21007. using type = bool;
  21008. };
  21009. template <typename TT, typename UU>
  21010. struct higher_conversion_rank {
  21011. template <typename A>
  21012. using fix_sign =
  21013. typename std::conditional<(std::is_unsigned<TT>::value ||
  21014. std::is_unsigned<UU>::value),
  21015. typename std::make_unsigned<A>::type, A>::type;
  21016. using T = typename my_make_signed<TT>::type;
  21017. using U = typename my_make_signed<UU>::type;
  21018. template <typename Test, typename Otherwise>
  21019. using c = typename std::conditional<std::is_same<T, Test>::value ||
  21020. std::is_same<U, Test>::value,
  21021. Test, Otherwise>::type;
  21022. using type = fix_sign<c<long long, c<long, c<int, c<short, c<signed char, void>>>>>>;
  21023. };
  21024. template <typename T, typename U> struct FundamentalReturnType<T, U, true, true> {
  21025. template <bool B, class Then, class E>
  21026. using c = typename std::conditional<B, Then, E>::type;
  21027. using type =
  21028. c<(sizeof(T) > sizeof(U)), T,
  21029. c<(sizeof(T) < sizeof(U)), U, typename higher_conversion_rank<T, U>::type>>;
  21030. };
  21031. template <class V, class T, class Tq, class = void> struct ReturnTypeImpl {
  21032. };
  21033. template <class T, class U, class Abi, class Uq>
  21034. struct ReturnTypeImpl<Vector<T, Abi>, Vector<U, Abi>, Uq, void> {
  21035. using type = Vc::Vector<fundamental_return_t<T, U>, Abi>;
  21036. };
  21037. template <class T, class Abi, class Uq>
  21038. struct ReturnTypeImpl<Vector<T, Abi>, int, Uq, void> {
  21039. using type = Vc::Vector<T, Abi>;
  21040. };
  21041. template <class T, class Abi, class Uq>
  21042. struct ReturnTypeImpl<Vector<T, Abi>, uint, Uq, void> {
  21043. using type = Vc::Vector<
  21044. typename std::conditional<std::is_integral<T>::value, std::make_unsigned<T>,
  21045. std::enable_if<true, T>>::type::type,
  21046. Abi>;
  21047. };
  21048. template <class T, class U, class Abi, class Uq>
  21049. struct ReturnTypeImpl<
  21050. Vector<T, Abi>, U, Uq,
  21051. enable_if<!std::is_class<U>::value && !std::is_same<U, int>::value &&
  21052. !std::is_same<U, uint>::value &&
  21053. Traits::is_valid_vector_argument<fundamental_return_t<T, U>>::value,
  21054. void>> {
  21055. using type = Vc::Vector<fundamental_return_t<T, U>, Abi>;
  21056. };
  21057. template <class T, class U, class Abi, class Uq>
  21058. struct ReturnTypeImpl<
  21059. Vector<T, Abi>, U, Uq,
  21060. enable_if<std::is_class<U>::value && !Traits::is_simd_vector<U>::value &&
  21061. Traits::is_valid_vector_argument<decltype(
  21062. is_convertible_to_any_vector<T, Abi>(std::declval<Uq>()))>::value,
  21063. void>> {
  21064. using type =
  21065. Vc::Vector<fundamental_return_t<T, decltype(is_convertible_to_any_vector<T, Abi>(
  21066. std::declval<Uq>()))>,
  21067. Abi>;
  21068. };
  21069. template <class V, class Tq, class T = remove_cvref_t<Tq>>
  21070. using ReturnType = typename ReturnTypeImpl<V, T, Tq>::type;
  21071. template <class T> struct is_a_type : public std::true_type {
  21072. };
  21073. #ifdef Vc_ENABLE_FLOAT_BIT_OPERATORS
  21074. #define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) true
  21075. #else
  21076. #define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) \
  21077. Detail::is_a_type<decltype(std::declval<typename R::value_type>() \
  21078. op_ std::declval<typename R::value_type>())>::value
  21079. #endif
  21080. }
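// Non-member operator templates: Vc_GENERIC_OPERATOR generates the binary and
// compound-assignment operators, Vc_LOGICAL_OPERATOR the &&/|| overloads
// returning masks, and Vc_COMPARE_OPERATOR the relational operators.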
  21081. #define Vc_GENERIC_OPERATOR(op_) \
  21082. template <class T, class Abi, class U, \
  21083. class R = Detail::ReturnType<Vector<T, Abi>, U>> \
  21084. Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \
  21085. std::is_convertible<Vector<T, Abi>, R>::value && \
  21086. std::is_convertible<U, R>::value, \
  21087. R> \
  21088. operator op_(Vector<T, Abi> x, U &&y) \
  21089. { \
  21090. return Detail::operator op_(R(x), R(std::forward<U>(y))); \
  21091. } \
  21092. template <class T, class Abi, class U, \
  21093. class R = Detail::ReturnType<Vector<T, Abi>, U>> \
  21094. Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \
  21095. !Traits::is_simd_vector<U>::value && \
  21096. std::is_convertible<Vector<T, Abi>, R>::value && \
  21097. std::is_convertible<U, R>::value, \
  21098. R> \
  21099. operator op_(U &&x, Vector<T, Abi> y) \
  21100. { \
  21101. return Detail::operator op_(R(std::forward<U>(x)), R(y)); \
  21102. } \
  21103. template <class T, class Abi, class U, \
  21104. class R = Detail::ReturnType<Vector<T, Abi>, U>> \
  21105. Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \
  21106. std::is_convertible<Vector<T, Abi>, R>::value && \
  21107. std::is_convertible<U, R>::value, \
  21108. Vector<T, Abi> &> \
  21109. operator op_##=(Vector<T, Abi> &x, U &&y) \
  21110. { \
  21111. x = Detail::operator op_(R(x), R(std::forward<U>(y))); \
  21112. return x; \
  21113. }
  21114. #define Vc_LOGICAL_OPERATOR(op_) \
  21115. template <class T, class Abi> \
  21116. Vc_ALWAYS_INLINE typename Vector<T, Abi>::Mask operator op_(Vector<T, Abi> x, \
  21117. Vector<T, Abi> y) \
  21118. { \
  21119. return !!x op_ !!y; \
  21120. } \
  21121. template <class T, class Abi, class U> \
  21122. Vc_ALWAYS_INLINE \
  21123. enable_if<std::is_convertible<Vector<T, Abi>, Vector<U, Abi>>::value && \
  21124. std::is_convertible<Vector<U, Abi>, Vector<T, Abi>>::value, \
  21125. typename Detail::ReturnType<Vector<T, Abi>, Vector<U, Abi>>::Mask> \
  21126. operator op_(Vector<T, Abi> x, Vector<U, Abi> y) \
  21127. { \
  21128. return !!x op_ !!y; \
  21129. } \
  21130. template <class T, class Abi, class U> \
  21131. Vc_ALWAYS_INLINE enable_if<std::is_same<bool, decltype(!std::declval<U>())>::value, \
  21132. typename Vector<T, Abi>::Mask> \
  21133. operator op_(Vector<T, Abi> x, U &&y) \
  21134. { \
  21135. using M = typename Vector<T, Abi>::Mask; \
  21136. return !!x op_ M(!!std::forward<U>(y)); \
  21137. } \
  21138. template <class T, class Abi, class U> \
  21139. Vc_ALWAYS_INLINE enable_if<std::is_same<bool, decltype(!std::declval<U>())>::value, \
  21140. typename Vector<T, Abi>::Mask> \
  21141. operator op_(U &&x, Vector<T, Abi> y) \
  21142. { \
  21143. using M = typename Vector<T, Abi>::Mask; \
  21144. return M(!!std::forward<U>(x)) op_ !!y; \
  21145. }
  21146. #define Vc_COMPARE_OPERATOR(op_) \
  21147. template <class T, class Abi, class U, \
  21148. class R = Detail::ReturnType<Vector<T, Abi>, U>> \
  21149. Vc_ALWAYS_INLINE enable_if<std::is_convertible<Vector<T, Abi>, R>::value && \
  21150. std::is_convertible<U, R>::value, \
  21151. typename R::Mask> \
  21152. operator op_(Vector<T, Abi> x, U &&y) \
  21153. { \
  21154. return Detail::operator op_(R(x), R(std::forward<U>(y))); \
  21155. } \
  21156. template <class T, class Abi, class U, \
  21157. class R = Detail::ReturnType<Vector<T, Abi>, U>> \
  21158. Vc_ALWAYS_INLINE \
  21159. enable_if<!Traits::is_simd_vector_internal<remove_cvref_t<U>>::value && \
  21160. std::is_convertible<Vector<T, Abi>, R>::value && \
  21161. std::is_convertible<U, R>::value, \
  21162. typename R::Mask> \
  21163. operator op_(U &&x, Vector<T, Abi> y) \
  21164. { \
  21165. return Detail::operator op_(R(std::forward<U>(x)), R(y)); \
  21166. }
  21167. Vc_ALL_LOGICAL (Vc_LOGICAL_OPERATOR);
  21168. Vc_ALL_BINARY (Vc_GENERIC_OPERATOR);
  21169. Vc_ALL_ARITHMETICS(Vc_GENERIC_OPERATOR);
  21170. Vc_ALL_COMPARES (Vc_COMPARE_OPERATOR);
  21171. #undef Vc_LOGICAL_OPERATOR
  21172. #undef Vc_GENERIC_OPERATOR
  21173. #undef Vc_COMPARE_OPERATOR
  21174. #undef Vc_INVALID_OPERATOR
  21175. }
  21176. #endif
  21177. #ifndef VC_COMMON_ALIGNEDBASE_H_
  21178. #define VC_COMMON_ALIGNEDBASE_H_
  21179. namespace Vc_VERSIONED_NAMESPACE
  21180. {
  21181. namespace Detail
  21182. {
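// constexpr variadic max, used below to select the largest alignment and
// memory alignment over all vector types.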
  21183. template <typename T> constexpr T max(T a) { return a; }
  21184. template <typename T, typename... Ts> constexpr T max(T a, T b, Ts... rest)
  21185. {
  21186. return a > b ? max(a, rest...) : max(b, rest...);
  21187. }
  21188. }
  21189. namespace Common
  21190. {
  21191. template <std::size_t> Vc_INTRINSIC void *aligned_malloc(std::size_t);
  21192. Vc_ALWAYS_INLINE void free(void *);
  21193. }
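// AlignedBase<Alignment>: deriving from this type over-aligns the derived
// class and provides matching aligned operator new/delete.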
  21194. template <std::size_t Alignment> struct alignas(Alignment) AlignedBase
  21195. {
  21196. Vc_FREE_STORE_OPERATORS_ALIGNED(Alignment);
  21197. };
  21198. using VectorAlignedBase = AlignedBase<
  21199. Detail::max(alignof(Vector<float>), alignof(Vector<double>), alignof(Vector<ullong>),
  21200. alignof(Vector<llong>), alignof(Vector<ulong>), alignof(Vector<long>),
  21201. alignof(Vector<uint>), alignof(Vector<int>), alignof(Vector<ushort>),
  21202. alignof(Vector<short>), alignof(Vector<uchar>), alignof(Vector<schar>))>;
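// Usage sketch (illustrative, not part of the library): a class holding
// vector members can derive from VectorAlignedBase to get correct alignment,
// e.g.  struct Point3D : public Vc::VectorAlignedBase { Vc::float_v x, y, z; };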
  21203. template <typename V> using VectorAlignedBaseT = AlignedBase<alignof(V)>;
  21204. using MemoryAlignedBase = AlignedBase<
  21205. Detail::max(Vector<float>::MemoryAlignment, Vector<double>::MemoryAlignment,
  21206. Vector<ullong>::MemoryAlignment, Vector<llong>::MemoryAlignment,
  21207. Vector<ulong>::MemoryAlignment, Vector<long>::MemoryAlignment,
  21208. Vector<uint>::MemoryAlignment, Vector<int>::MemoryAlignment,
  21209. Vector<ushort>::MemoryAlignment, Vector<short>::MemoryAlignment,
  21210. Vector<uchar>::MemoryAlignment, Vector<schar>::MemoryAlignment)>;
  21211. template <typename V> using MemoryAlignedBaseT = AlignedBase<V::MemoryAlignment>;
  21212. }
  21213. #endif
  21214. namespace Vc_VERSIONED_NAMESPACE {
  21215. constexpr std::size_t VectorAlignment = alignof(VectorAlignedBase);
  21216. constexpr std::size_t MemoryAlignment = alignof(MemoryAlignedBase);
  21217. }
  21218. #define Vc_VECTOR_DECLARED_ 1
  21219. #ifndef VC_SCALAR_DEINTERLEAVE_H_
  21220. #define VC_SCALAR_DEINTERLEAVE_H_
  21221. namespace Vc_VERSIONED_NAMESPACE
  21222. {
  21223. namespace Detail
  21224. {
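// Scalar ABI: deinterleaving two scalar 'vectors' is just two element loads;
// the prefetch hints are no-ops.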
  21225. template <typename T, typename M, typename A>
  21226. Vc_ALWAYS_INLINE void deinterleave(Scalar::Vector<T> &a, Scalar::Vector<T> &b,
  21227. const M *mem, A)
  21228. {
  21229. a = mem[0];
  21230. b = mem[1];
  21231. }
  21232. Vc_ALWAYS_INLINE void prefetchForOneRead(const void *, VectorAbi::Scalar) {}
  21233. Vc_ALWAYS_INLINE void prefetchForModify(const void *, VectorAbi::Scalar) {}
  21234. Vc_ALWAYS_INLINE void prefetchClose(const void *, VectorAbi::Scalar) {}
  21235. Vc_ALWAYS_INLINE void prefetchMid(const void *, VectorAbi::Scalar) {}
  21236. Vc_ALWAYS_INLINE void prefetchFar(const void *, VectorAbi::Scalar) {}
  21237. }
  21238. }
  21239. #endif
  21240. #ifndef VC_SCALAR_MATH_H_
  21241. #define VC_SCALAR_MATH_H_
  21242. #include <cstdlib>
  21243. namespace Vc_VERSIONED_NAMESPACE
  21244. {
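// Scalar copysign: copy the IEEE-754 sign bit of 'sign' onto the magnitude
// bits of 'mag' via a union-based bit manipulation.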
  21245. Vc_INTRINSIC Scalar::float_v copysign(Scalar::float_v mag, Scalar::float_v sign)
  21246. {
  21247. union {
  21248. float f;
  21249. unsigned int i;
  21250. } value, s;
  21251. value.f = mag.data();
  21252. s.f = sign.data();
  21253. value.i = (s.i & 0x80000000u) | (value.i & 0x7fffffffu);
  21254. return Scalar::float_v{value.f};
  21255. }
  21256. Vc_INTRINSIC Vc_CONST Scalar::double_v copysign(Scalar::double_v mag,
  21257. Scalar::double_v sign)
  21258. {
  21259. union {
  21260. double f;
  21261. unsigned long long i;
  21262. } value, s;
  21263. value.f = mag.data();
  21264. s.f = sign.data();
  21265. value.i = (s.i & 0x8000000000000000ull) | (value.i & 0x7fffffffffffffffull);
  21266. return Scalar::double_v{value.f};
  21267. }
  21268. #define Vc_MINMAX(V) \
  21269. static Vc_ALWAYS_INLINE Scalar::V min(const Scalar::V &x, const Scalar::V &y) \
  21270. { \
  21271. return Scalar::V(std::min(x.data(), y.data())); \
  21272. } \
  21273. static Vc_ALWAYS_INLINE Scalar::V max(const Scalar::V &x, const Scalar::V &y) \
  21274. { \
  21275. return Scalar::V(std::max(x.data(), y.data())); \
  21276. }
  21277. Vc_ALL_VECTOR_TYPES(Vc_MINMAX);
  21278. #undef Vc_MINMAX
  21279. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> sqrt (const Scalar::Vector<T> &x)
  21280. {
  21281. return Scalar::Vector<T>(std::sqrt(x.data()));
  21282. }
  21283. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> rsqrt(const Scalar::Vector<T> &x)
  21284. {
21285. const typename Vector<T, VectorAbi::Scalar>::EntryType one = 1;
return Scalar::Vector<T>(one / std::sqrt(x.data()));
  21286. }
  21287. template <typename T,
  21288. typename = enable_if<std::is_same<T, double>::value || std::is_same<T, float>::value ||
  21289. std::is_same<T, short>::value ||
  21290. std::is_same<T, int>::value>>
  21291. Vc_ALWAYS_INLINE Vc_PURE Scalar::Vector<T> abs(Scalar::Vector<T> x)
  21292. {
  21293. return std::abs(x.data());
  21294. }
  21295. template<typename T> static Vc_ALWAYS_INLINE void sincos(const Scalar::Vector<T> &x, Scalar::Vector<T> *sin, Scalar::Vector<T> *cos)
  21296. {
  21297. #if defined(_WIN32) || defined(__APPLE__)
  21298. sin->data() = std::sin(x.data());
  21299. cos->data() = std::cos(x.data());
  21300. #elif Vc_HAS_BUILTIN(__builtin_sincosf) || defined Vc_GCC
  21301. __builtin_sincosf(x.data(), &sin->data(), &cos->data());
  21302. #else
  21303. sincosf(x.data(), &sin->data(), &cos->data());
  21304. #endif
  21305. }
  21306. template<> Vc_ALWAYS_INLINE void sincos(const Scalar::Vector<double> &x, Scalar::Vector<double> *sin, Scalar::Vector<double> *cos)
  21307. {
  21308. #if defined(_WIN32) || defined(__APPLE__)
  21309. sin->data() = std::sin(x.data());
  21310. cos->data() = std::cos(x.data());
  21311. #elif Vc_HAS_BUILTIN(__builtin_sincos) || defined Vc_GCC
  21312. __builtin_sincos(x.data(), &sin->data(), &cos->data());
  21313. #else
  21314. ::sincos(x.data(), &sin->data(), &cos->data());
  21315. #endif
  21316. }
  21317. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> sin (const Scalar::Vector<T> &x)
  21318. {
  21319. return Scalar::Vector<T>(std::sin(x.data()));
  21320. }
  21321. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> asin (const Scalar::Vector<T> &x)
  21322. {
  21323. return Scalar::Vector<T>(std::asin(x.data()));
  21324. }
  21325. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> cos (const Scalar::Vector<T> &x)
  21326. {
  21327. return Scalar::Vector<T>(std::cos(x.data()));
  21328. }
  21329. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> log (const Scalar::Vector<T> &x)
  21330. {
  21331. return Scalar::Vector<T>(std::log(x.data()));
  21332. }
  21333. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> log10(const Scalar::Vector<T> &x)
  21334. {
  21335. return Scalar::Vector<T>(std::log10(x.data()));
  21336. }
  21337. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> log2(const Scalar::Vector<T> &x)
  21338. {
  21339. return Scalar::Vector<T>(std::log2(x.data()));
  21340. }
  21341. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> exp (const Scalar::Vector<T> &x)
  21342. {
  21343. return Scalar::Vector<T>(std::exp(x.data()));
  21344. }
  21345. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> atan (const Scalar::Vector<T> &x)
  21346. {
  21347. return Scalar::Vector<T>(std::atan( x.data() ));
  21348. }
  21349. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> atan2(const Scalar::Vector<T> &x, const Scalar::Vector<T> &y)
  21350. {
  21351. return Scalar::Vector<T>(std::atan2( x.data(), y.data() ));
  21352. }
  21353. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> trunc(const Scalar::Vector<T> &x)
  21354. {
  21355. return std::trunc(x.data());
  21356. }
  21357. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> floor(const Scalar::Vector<T> &x)
  21358. {
  21359. return Scalar::Vector<T>(std::floor(x.data()));
  21360. }
  21361. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> ceil(const Scalar::Vector<T> &x)
  21362. {
  21363. return Scalar::Vector<T>(std::ceil(x.data()));
  21364. }
  21365. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> round(const Scalar::Vector<T> &x)
  21366. {
  21367. return x;
  21368. }
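// The unspecialized round() above is the identity, which is correct for the
// integral entry types; the float and double specializations below implement
// round-half-to-even with the help of _realIsEvenHalf.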
  21369. namespace
  21370. {
  21371. template<typename T> bool _realIsEvenHalf(T x) {
  21372. const T two = 2;
  21373. const T half = 0.5;
  21374. const T f = std::floor(x * half) * two;
  21375. return (x - f) == half;
  21376. }
  21377. }
  21378. template<> Vc_ALWAYS_INLINE Scalar::Vector<float> round(const Scalar::Vector<float> &x)
  21379. {
  21380. return Scalar::float_v(std::floor(x.data() + 0.5f) - (_realIsEvenHalf(x.data()) ? 1.f : 0.f));
  21381. }
  21382. template<> Vc_ALWAYS_INLINE Scalar::Vector<double> round(const Scalar::Vector<double> &x)
  21383. {
  21384. return Scalar::double_v(std::floor(x.data() + 0.5 ) - (_realIsEvenHalf(x.data()) ? 1. : 0. ));
  21385. }
  21386. template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> reciprocal(const Scalar::Vector<T> &x)
  21387. {
21388. const typename Vector<T, VectorAbi::Scalar>::EntryType one = 1;
return Scalar::Vector<T>(one / x.data());
  21389. }
  21390. #ifdef isfinite
  21391. #undef isfinite
  21392. #endif
  21393. #ifdef isnan
  21394. #undef isnan
  21395. #endif
  21396. template<typename T> static Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Scalar>::Mask isfinite(const Scalar::Vector<T> &x)
  21397. {
  21398. return typename Vector<T, VectorAbi::Scalar>::Mask(
  21399. #ifdef _MSC_VER
  21400. !!_finite(x.data())
  21401. #elif defined(__INTEL_COMPILER) && __INTEL_COMPILER < 1500
  21402. ::isfinite(x.data())
  21403. #else
  21404. std::isfinite(x.data())
  21405. #endif
  21406. );
  21407. }
  21408. template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Scalar>::Mask isinf(const Scalar::Vector<T> &x)
  21409. {
  21410. return typename Vector<T, VectorAbi::Scalar>::Mask(std::isinf(x.data()));
  21411. }
  21412. template<typename T> static Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Scalar>::Mask isnan(const Scalar::Vector<T> &x)
  21413. {
  21414. return typename Vector<T, VectorAbi::Scalar>::Mask(
  21415. #ifdef _MSC_VER
  21416. !!_isnan(x.data())
  21417. #elif defined(__INTEL_COMPILER) && __INTEL_COMPILER < 1500
  21418. ::isnan(x.data())
  21419. #else
  21420. std::isnan(x.data())
  21421. #endif
  21422. );
  21423. }
  21424. Vc_ALWAYS_INLINE Scalar::Vector<float> frexp(Scalar::Vector<float> x, SimdArray<int, 1, Scalar::Vector<int>, 1> *e) {
  21425. return Scalar::float_v(std::frexp(x.data(), &internal_data(*e).data()));
  21426. }
  21427. Vc_ALWAYS_INLINE Scalar::Vector<double> frexp(Scalar::Vector<double> x, SimdArray<int, 1, Scalar::Vector<int>, 1> *e) {
  21428. return Scalar::double_v(std::frexp(x.data(), &internal_data(*e).data()));
  21429. }
  21430. Vc_ALWAYS_INLINE Scalar::Vector<float> ldexp(Scalar::Vector<float> x, const SimdArray<int, 1, Scalar::Vector<int>, 1> &e) {
  21431. return Scalar::float_v(std::ldexp(x.data(), internal_data(e).data()));
  21432. }
  21433. Vc_ALWAYS_INLINE Scalar::Vector<double> ldexp(Scalar::Vector<double> x, const SimdArray<int, 1, Scalar::Vector<int>, 1> &e) {
  21434. return Scalar::double_v(std::ldexp(x.data(), internal_data(e).data()));
  21435. }
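// Scalar fma: exact a * b + c for integral types, std::fma for floating point.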
  21436. template <typename T>
  21437. Vc_ALWAYS_INLINE Vector<T, VectorAbi::Scalar> fma(Vector<T, VectorAbi::Scalar> a,
  21438. Vector<T, VectorAbi::Scalar> b,
  21439. Vector<T, VectorAbi::Scalar> c)
  21440. {
  21441. if (std::is_integral<T>::value) {
  21442. return a * b + c;
  21443. } else {
  21444. return std::fma(a.data(), b.data(), c.data());
  21445. }
  21446. }
  21447. }
  21448. #endif
  21449. #ifndef Vc_SCALAR_SIMD_CAST_CALLER_TCC_
  21450. #define Vc_SCALAR_SIMD_CAST_CALLER_TCC_
  21451. namespace Vc_VERSIONED_NAMESPACE
  21452. {
  21453. #if Vc_IS_VERSION_1
  21454. template <typename T>
  21455. template <typename U>
  21456. Vc_INTRINSIC Mask<T, VectorAbi::Scalar>::Mask(
  21457. U &&rhs, Common::enable_if_mask_converts_explicitly<T, U>)
  21458. : Mask(simd_cast<Mask>(std::forward<U>(rhs)))
  21459. {
  21460. }
  21461. #endif
  21462. }
  21463. #endif
  21464. #if defined(Vc_IMPL_SSE)
  21465. #ifndef VC_SSE_DEINTERLEAVE_H_
  21466. #define VC_SSE_DEINTERLEAVE_H_
  21467. namespace Vc_VERSIONED_NAMESPACE
  21468. {
  21469. namespace Detail
  21470. {
  21471. template <typename A>
  21472. inline void deinterleave(SSE::float_v &, SSE::float_v &, const float *, A);
  21473. template <typename A>
  21474. inline void deinterleave(SSE::float_v &, SSE::float_v &, const short *, A);
  21475. template <typename A>
  21476. inline void deinterleave(SSE::float_v &, SSE::float_v &, const ushort *, A);
  21477. template <typename A>
  21478. inline void deinterleave(SSE::double_v &, SSE::double_v &, const double *, A);
  21479. template <typename A>
  21480. inline void deinterleave(SSE::int_v &, SSE::int_v &, const int *, A);
  21481. template <typename A>
  21482. inline void deinterleave(SSE::int_v &, SSE::int_v &, const short *, A);
  21483. template <typename A>
  21484. inline void deinterleave(SSE::uint_v &, SSE::uint_v &, const uint *, A);
  21485. template <typename A>
  21486. inline void deinterleave(SSE::uint_v &, SSE::uint_v &, const ushort *, A);
  21487. template <typename A>
  21488. inline void deinterleave(SSE::short_v &, SSE::short_v &, const short *, A);
  21489. template <typename A>
  21490. inline void deinterleave(SSE::ushort_v &, SSE::ushort_v &, const ushort *, A);
  21491. Vc_ALWAYS_INLINE_L void prefetchForOneRead(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
  21492. Vc_ALWAYS_INLINE_L void prefetchForModify(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
  21493. Vc_ALWAYS_INLINE_L void prefetchClose(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
  21494. Vc_ALWAYS_INLINE_L void prefetchMid(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
  21495. Vc_ALWAYS_INLINE_L void prefetchFar(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
  21496. }
  21497. }
  21498. namespace Vc_VERSIONED_NAMESPACE
  21499. {
  21500. namespace SSE
  21501. {
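// In-register SSE deinterleave: two registers holding interleaved data
// {a0,b0,a1,b1,...} are separated with unpacklo/unpackhi so that afterwards
// a = {a0,a1,...} and b = {b0,b1,...}; the (u)short-to-float overloads widen
// the 16-bit halves with shifts before converting.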
  21502. inline void deinterleave(Vector<float> &a, Vector<float> &b)
  21503. {
  21504. const __m128 tmp0 = _mm_unpacklo_ps(a.data(), b.data());
  21505. const __m128 tmp1 = _mm_unpackhi_ps(a.data(), b.data());
  21506. a.data() = _mm_unpacklo_ps(tmp0, tmp1);
  21507. b.data() = _mm_unpackhi_ps(tmp0, tmp1);
  21508. }
  21509. inline void deinterleave(Vector<float> &a, Vector<float> &b, Vector<short>::AsArg tmp)
  21510. {
  21511. a.data() = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(tmp.data(), 16), 16));
  21512. b.data() = _mm_cvtepi32_ps(_mm_srai_epi32(tmp.data(), 16));
  21513. }
  21514. inline void deinterleave(Vector<float> &a, Vector<float> &b, Vector<unsigned short>::AsArg tmp)
  21515. {
  21516. a.data() = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(tmp.data(), 16), 16));
  21517. b.data() = _mm_cvtepi32_ps(_mm_srli_epi32(tmp.data(), 16));
  21518. }
  21519. inline void deinterleave(Vector<double> &a, Vector<double> &b)
  21520. {
  21521. __m128d tmp = _mm_unpacklo_pd(a.data(), b.data());
  21522. b.data() = _mm_unpackhi_pd(a.data(), b.data());
  21523. a.data() = tmp;
  21524. }
  21525. inline void deinterleave(Vector<int> &a, Vector<int> &b)
  21526. {
  21527. const __m128i tmp0 = _mm_unpacklo_epi32(a.data(), b.data());
  21528. const __m128i tmp1 = _mm_unpackhi_epi32(a.data(), b.data());
  21529. a.data() = _mm_unpacklo_epi32(tmp0, tmp1);
  21530. b.data() = _mm_unpackhi_epi32(tmp0, tmp1);
  21531. }
  21532. inline void deinterleave(Vector<unsigned int> &a, Vector<unsigned int> &b)
  21533. {
  21534. const __m128i tmp0 = _mm_unpacklo_epi32(a.data(), b.data());
  21535. const __m128i tmp1 = _mm_unpackhi_epi32(a.data(), b.data());
  21536. a.data() = _mm_unpacklo_epi32(tmp0, tmp1);
  21537. b.data() = _mm_unpackhi_epi32(tmp0, tmp1);
  21538. }
  21539. inline void deinterleave(Vector<short> &a, Vector<short> &b)
  21540. {
  21541. __m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data());
  21542. __m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data());
  21543. __m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
  21544. __m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
  21545. a.data() = _mm_unpacklo_epi16(tmp2, tmp3);
  21546. b.data() = _mm_unpackhi_epi16(tmp2, tmp3);
  21547. }
  21548. inline void deinterleave(Vector<unsigned short> &a, Vector<unsigned short> &b)
  21549. {
  21550. __m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data());
  21551. __m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data());
  21552. __m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
  21553. __m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
  21554. a.data() = _mm_unpacklo_epi16(tmp2, tmp3);
  21555. b.data() = _mm_unpackhi_epi16(tmp2, tmp3);
  21556. }
  21557. inline void deinterleave(Vector<int> &a, Vector<int> &b, Vector<short>::AsArg tmp)
  21558. {
  21559. a.data() = _mm_srai_epi32(_mm_slli_epi32(tmp.data(), 16), 16);
  21560. b.data() = _mm_srai_epi32(tmp.data(), 16);
  21561. }
  21562. inline void deinterleave(Vector<unsigned int> &a, Vector<unsigned int> &b, Vector<unsigned short>::AsArg tmp)
  21563. {
  21564. a.data() = _mm_srli_epi32(_mm_slli_epi32(tmp.data(), 16), 16);
  21565. b.data() = _mm_srli_epi32(tmp.data(), 16);
  21566. }
  21567. }
  21568. }
  21569. namespace Vc_VERSIONED_NAMESPACE
  21570. {
  21571. namespace Detail
  21572. {
  21573. template<typename A> inline void deinterleave(
  21574. SSE::float_v &a, SSE::float_v &b, const float *m, A align)
  21575. {
  21576. a.load(m, align);
  21577. b.load(m + SSE::float_v::Size, align);
  21578. Vc::SSE::deinterleave(a, b);
  21579. }
  21580. template<typename A> inline void deinterleave(
  21581. SSE::float_v &a, SSE::float_v &b, const short *m, A align)
  21582. {
  21583. SSE::short_v tmp(m, align);
  21584. Vc::SSE::deinterleave(a, b, tmp);
  21585. }
  21586. template<typename A> inline void deinterleave(
  21587. SSE::float_v &a, SSE::float_v &b, const unsigned short *m, A align)
  21588. {
  21589. SSE::ushort_v tmp(m, align);
  21590. Vc::SSE::deinterleave(a, b, tmp);
  21591. }
  21592. template<typename A> inline void deinterleave(
  21593. SSE::double_v &a, SSE::double_v &b, const double *m, A align)
  21594. {
  21595. a.load(m, align);
  21596. b.load(m + SSE::double_v::Size, align);
  21597. Vc::SSE::deinterleave(a, b);
  21598. }
  21599. template<typename A> inline void deinterleave(
  21600. SSE::int_v &a, SSE::int_v &b, const int *m, A align)
  21601. {
  21602. a.load(m, align);
  21603. b.load(m + SSE::int_v::Size, align);
  21604. Vc::SSE::deinterleave(a, b);
  21605. }
  21606. template<typename A> inline void deinterleave(
  21607. SSE::int_v &a, SSE::int_v &b, const short *m, A align)
  21608. {
  21609. SSE::short_v tmp(m, align);
  21610. Vc::SSE::deinterleave(a, b, tmp);
  21611. }
  21612. template<typename A> inline void deinterleave(
  21613. SSE::uint_v &a, SSE::uint_v &b, const unsigned int *m, A align)
  21614. {
  21615. a.load(m, align);
  21616. b.load(m + SSE::uint_v::Size, align);
  21617. Vc::SSE::deinterleave(a, b);
  21618. }
  21619. template<typename A> inline void deinterleave(
  21620. SSE::uint_v &a, SSE::uint_v &b, const unsigned short *m, A align)
  21621. {
  21622. SSE::ushort_v tmp(m, align);
  21623. Vc::SSE::deinterleave(a, b, tmp);
  21624. }
  21625. template<typename A> inline void deinterleave(
  21626. SSE::short_v &a, SSE::short_v &b, const short *m, A align)
  21627. {
  21628. a.load(m, align);
  21629. b.load(m + SSE::short_v::Size, align);
  21630. Vc::SSE::deinterleave(a, b);
  21631. }
  21632. template<typename A> inline void deinterleave(
  21633. SSE::ushort_v &a, SSE::ushort_v &b, const unsigned short *m, A align)
  21634. {
  21635. a.load(m, align);
  21636. b.load(m + SSE::ushort_v::Size, align);
  21637. Vc::SSE::deinterleave(a, b);
  21638. }
  21639. }
  21640. }
  21641. #ifndef VC_SSE_PREFETCHES_TCC_
  21642. #define VC_SSE_PREFETCHES_TCC_
  21643. namespace Vc_VERSIONED_NAMESPACE
  21644. {
  21645. namespace Detail
  21646. {
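// SSE prefetch hints map directly onto _mm_prefetch with the corresponding
// locality hint (NTA/T0/T1/T2); prefetchForModify uses 3DNow! prefetchw when
// available and falls back to a T0 prefetch otherwise.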
  21647. Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr, VectorAbi::Sse)
  21648. {
  21649. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_NTA);
  21650. }
  21651. Vc_ALWAYS_INLINE void prefetchClose(const void *addr, VectorAbi::Sse)
  21652. {
  21653. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T0);
  21654. }
  21655. Vc_ALWAYS_INLINE void prefetchMid(const void *addr, VectorAbi::Sse)
  21656. {
  21657. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T1);
  21658. }
  21659. Vc_ALWAYS_INLINE void prefetchFar(const void *addr, VectorAbi::Sse)
  21660. {
  21661. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T2);
  21662. }
  21663. Vc_ALWAYS_INLINE void prefetchForModify(const void *addr, VectorAbi::Sse)
  21664. {
  21665. #ifdef __3dNOW__
  21666. _m_prefetchw(const_cast<void *>(addr));
  21667. #else
  21668. _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T0);
  21669. #endif
  21670. }
  21671. }
  21672. }
  21673. #endif
  21674. #endif
  21675. #ifndef VC_SSE_MATH_H_
  21676. #define VC_SSE_MATH_H_
  21677. #ifndef VC_SSE_CONST_H_
  21678. #define VC_SSE_CONST_H_
  21679. namespace Vc_VERSIONED_NAMESPACE
  21680. {
  21681. namespace SSE
  21682. {
  21683. template<typename T> struct Const
  21684. {
  21685. typedef Vector<T> V;
  21686. typedef Mask<T> M;
  21687. enum Constants { Stride = 16 / sizeof(T) };
  21688. static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return load(&c_trig<T>::data[0 * Stride]); }
  21689. static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return load(&c_trig<T>::data[1 * Stride]); }
  21690. static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return load(&c_trig<T>::data[2 * Stride]); }
  21691. static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return load(&c_trig<T>::data[3 * Stride]); }
  21692. static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return load(&c_trig<T>::data[4 * Stride]); }
  21693. static Vc_ALWAYS_INLINE Vc_CONST V _16() { return load(&c_trig<T>::data[5 * Stride]); }
  21694. static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return load(&c_trig<T>::data[(12 + i) * Stride]); }
  21695. static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return load(&c_trig<T>::data[(17 + i) * Stride]); }
  21696. static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return load(&c_trig<T>::data[22 * Stride]); }
  21697. static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return load(&c_trig<T>::data[23 * Stride]); }
  21698. static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return load(&c_trig<T>::data[24 * Stride]); }
  21699. static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return load(&c_trig<T>::data[8 * Stride]); }
  21700. static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return load(&c_trig<T>::data[9 * Stride]); }
  21701. static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return load(&c_trig<T>::data[10 * Stride]); }
  21702. static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return load(&c_trig<T>::data[11 * Stride]); }
  21703. static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return load(&c_trig<T>::data[(28 + i) * Stride]); }
  21704. static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return load(&c_trig<T>::data[(33 + i) * Stride]); }
  21705. static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return load(&c_trig<T>::data[(37 + i) * Stride]); }
  21706. static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return load(&c_trig<T>::data[(43 + i) * Stride]); }
  21707. static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return load(&c_trig<T>::data[25 * Stride]); }
  21708. static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return load(&c_trig<T>::data[26 * Stride]); }
  21709. static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(load(c_log<T>::d(1)).data()); }
  21710. static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return load(c_log<T>::d(18)); }
  21711. static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return load(c_log<T>::d(15)); }
  21712. static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return load(c_log<T>::d(2 + i)); }
  21713. static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return load(c_log<T>::d(8 + i)); }
  21714. static Vc_ALWAYS_INLINE Vc_CONST V min() { return load(c_log<T>::d(14)); }
  21715. static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return load(c_log<T>::d(17)); }
  21716. static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return load(c_log<T>::d(16)); }
  21717. static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return load(c_log<T>::d(13)); }
  21718. static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return load(c_log<T>::d(19)); }
  21719. static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return load(c_log<T>::d(20)); }
  21720. static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R;
  21721. static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask(int bits) Vc_ALWAYS_INLINE_R Vc_CONST_R;
  21722. private:
  21723. static Vc_ALWAYS_INLINE_L Vc_CONST_L V load(const T *mem) Vc_ALWAYS_INLINE_R Vc_CONST_R;
  21724. };
  21725. template<typename T> Vc_ALWAYS_INLINE Vc_CONST Vector<T> Const<T>::load(const T *mem) { return V(mem); }
  21726. template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask()
  21727. {
  21728. return Vector<float>(reinterpret_cast<const float *>(&c_general::highMaskFloat));
  21729. }
  21730. template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask()
  21731. {
  21732. return Vector<double>(
  21733. reinterpret_cast<const double *>(&c_general::highMaskDouble));
  21734. }
  21735. template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask(int bits)
  21736. {
  21737. return _mm_castsi128_ps(_mm_slli_epi32(_mm_setallone_si128(), bits));
  21738. }
  21739. template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask(int bits)
  21740. {
  21741. return _mm_castsi128_pd(_mm_slli_epi64(_mm_setallone_si128(), bits));
  21742. }
  21743. }
  21744. }
  21745. #endif
  21746. namespace Vc_VERSIONED_NAMESPACE
  21747. {
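// SSE copysign combines the sign bit of 'sign' with the magnitude bits of
// 'mag'; frexp extracts the biased exponent and rescales the mantissa into
// [0.5, 1); ldexp adds the exponent argument into the IEEE-754 exponent field.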
  21748. Vc_INTRINSIC Vc_CONST SSE::float_v copysign(SSE::float_v mag, SSE::float_v sign)
  21749. {
  21750. return _mm_or_ps(_mm_and_ps(sign.data(), SSE::_mm_setsignmask_ps()),
  21751. _mm_and_ps(mag.data(), SSE::_mm_setabsmask_ps()));
  21752. }
  21753. Vc_INTRINSIC Vc_CONST SSE::double_v copysign(SSE::double_v mag, SSE::double_v sign)
  21754. {
  21755. return _mm_or_pd(_mm_and_pd(sign.data(), SSE::_mm_setsignmask_pd()),
  21756. _mm_and_pd(mag.data(), SSE::_mm_setabsmask_pd()));
  21757. }
  21758. inline SSE::double_v frexp(const SSE::double_v &v,
  21759. SimdArray<int, 2, Scalar::int_v, 1> *e)
  21760. {
  21761. const __m128i exponentBits = SSE::Const<double>::exponentMask().dataI();
  21762. const __m128i exponentPart = _mm_and_si128(_mm_castpd_si128(v.data()), exponentBits);
  21763. SSE::int_v exponent =
  21764. _mm_sub_epi32(_mm_srli_epi64(exponentPart, 52), _mm_set1_epi32(0x3fe));
  21765. const __m128d exponentMaximized = _mm_or_pd(v.data(), _mm_castsi128_pd(exponentBits));
  21766. SSE::double_v ret = _mm_and_pd(
  21767. exponentMaximized,
  21768. _mm_load_pd(reinterpret_cast<const double *>(&SSE::c_general::frexpMask[0])));
  21769. SSE::double_m zeroMask = v == SSE::double_v::Zero();
  21770. ret(isnan(v) || !isfinite(v) || zeroMask) = v;
  21771. exponent.setZero(zeroMask.data());
  21772. (*e)[0] = exponent[0];
  21773. (*e)[1] = exponent[2];
  21774. return ret;
  21775. }
  21776. inline SSE::float_v frexp(const SSE::float_v &v, SimdArray<int, 4, SSE::int_v, 4> *e)
  21777. {
  21778. const __m128i exponentBits = SSE::Const<float>::exponentMask().dataI();
  21779. const __m128i exponentPart = _mm_and_si128(_mm_castps_si128(v.data()), exponentBits);
  21780. internal_data(*e) =
  21781. _mm_sub_epi32(_mm_srli_epi32(exponentPart, 23), _mm_set1_epi32(0x7e));
  21782. const __m128 exponentMaximized = _mm_or_ps(v.data(), _mm_castsi128_ps(exponentBits));
  21783. SSE::float_v ret =
  21784. _mm_and_ps(exponentMaximized, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu)));
  21785. ret(isnan(v) || !isfinite(v) || v == SSE::float_v::Zero()) = v;
  21786. e->setZero(v == SSE::float_v::Zero());
  21787. return ret;
  21788. }
  21789. inline SSE::double_v ldexp(SSE::double_v::AsArg v,
  21790. const SimdArray<int, 2, Scalar::int_v, 1> &_e)
  21791. {
  21792. SSE::int_v e = _mm_setr_epi32(_e[0], 0, _e[1], 0);
  21793. e.setZero((v == SSE::double_v::Zero()).dataI());
  21794. const __m128i exponentBits = _mm_slli_epi64(e.data(), 52);
  21795. return _mm_castsi128_pd(_mm_add_epi64(_mm_castpd_si128(v.data()), exponentBits));
  21796. }
  21797. inline SSE::float_v ldexp(SSE::float_v::AsArg v,
  21798. const SimdArray<int, 4, SSE::int_v, 4> &_e)
  21799. {
  21800. SSE::int_v e = internal_data(_e);
  21801. e.setZero(simd_cast<SSE::int_m>(v == SSE::float_v::Zero()));
  21802. return reinterpret_components_cast<SSE::float_v>(
  21803. reinterpret_components_cast<SSE::int_v>(v) + (e << 23));
  21804. }
  21805. #ifdef Vc_IMPL_SSE4_1
  21806. inline SSE::double_v trunc(SSE::double_v::AsArg v) { return _mm_round_pd(v.data(), 0x3); }
  21807. inline SSE::float_v trunc(SSE::float_v::AsArg v) { return _mm_round_ps(v.data(), 0x3); }
  21808. inline SSE::double_v floor(SSE::double_v::AsArg v) { return _mm_floor_pd(v.data()); }
  21809. inline SSE::float_v floor(SSE::float_v::AsArg v) { return _mm_floor_ps(v.data()); }
  21810. inline SSE::double_v ceil(SSE::double_v::AsArg v) { return _mm_ceil_pd(v.data()); }
  21811. inline SSE::float_v ceil(SSE::float_v::AsArg v) { return _mm_ceil_ps(v.data()); }
  21812. #else
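// Without SSE4.1 there is no _mm_round/_mm_floor/_mm_ceil, so truncation is
// emulated: the float version round-trips through a signed-integer conversion
// and keeps the original value wherever |x| >= 2^23 (no representable
// fractional part); the double version uses the add-and-subtract-2^52 trick,
// corrects the case where that rounded up, and restores the sign bit.
// floor() and ceil() are then derived from trunc().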
  21813. inline SSE::Vector<float> trunc(SSE::Vector<float> x)
  21814. {
  21815. const auto truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x.data()));
  21816. const auto no_fractional_values = _mm_castsi128_ps(_mm_cmplt_epi32(
  21817. _mm_and_si128(_mm_castps_si128(x.data()), _mm_set1_epi32(0x7f800000u)),
  21818. _mm_set1_epi32(0x4b000000)));
  21819. return _mm_or_ps(_mm_andnot_ps(no_fractional_values, x.data()),
  21820. _mm_and_ps(no_fractional_values, truncated));
  21821. }
  21822. inline SSE::Vector<double> trunc(SSE::Vector<double> x)
  21823. {
  21824. const auto abs_x = Vc::abs(x).data();
  21825. const auto min_no_fractional_bits =
  21826. _mm_castsi128_pd(_mm_set1_epi64x(0x4330000000000000ull));
  21827. __m128d truncated =
  21828. _mm_sub_pd(_mm_add_pd(abs_x, min_no_fractional_bits), min_no_fractional_bits);
  21829. truncated = _mm_sub_pd(truncated,
  21830. _mm_and_pd(_mm_cmplt_pd(abs_x, truncated), _mm_set1_pd(1.)));
  21831. return _mm_or_pd(
  21832. _mm_and_pd(_mm_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ull)), x.data()),
  21833. truncated);
  21834. }
  21835. template <typename T> inline SSE::Vector<T> floor(SSE::Vector<T> x)
  21836. {
  21837. auto y = trunc(x);
  21838. y(!(y == x) && x < 0) -= 1;
  21839. return y;
  21840. }
  21841. template <typename T> inline SSE::Vector<T> ceil(SSE::Vector<T> x)
  21842. {
  21843. auto y = trunc(x);
  21844. y(!(y == x || x < 0)) += 1;
  21845. return y;
  21846. }
  21847. #endif
  21848. template <typename T>
  21849. Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> fma(Vector<T, VectorAbi::Sse> a,
  21850. Vector<T, VectorAbi::Sse> b,
  21851. Vector<T, VectorAbi::Sse> c)
  21852. {
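// Note: VectorHelper<T>::fma is assumed here to update its first argument in
// place, so returning 'a' yields a * b + c.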
  21853. SSE::VectorHelper<T>::fma(a.data(), b.data(), c.data());
  21854. return a;
  21855. }
  21856. }
  21857. #endif
  21858. #ifndef Vc_SSE_SIMD_CAST_CALLER_TCC_
  21859. #define Vc_SSE_SIMD_CAST_CALLER_TCC_
  21860. namespace Vc_VERSIONED_NAMESPACE
  21861. {
  21862. #if Vc_IS_VERSION_1
  21863. template <typename T>
  21864. template <typename U>
  21865. Vc_INTRINSIC Mask<T, VectorAbi::Sse>::Mask(U &&rhs, Common::enable_if_mask_converts_explicitly<T, U>)
  21866. : Mask(Vc::simd_cast<Mask>(std::forward<U>(rhs)))
  21867. {
  21868. }
  21869. #endif
  21870. }
  21871. #endif
  21872. #endif
  21873. #if defined(Vc_IMPL_AVX)
  21874. #ifndef VC_AVX_HELPERIMPL_H_
  21875. #define VC_AVX_HELPERIMPL_H_
  21876. namespace Vc_VERSIONED_NAMESPACE
  21877. {
  21878. namespace Detail
  21879. {
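// Forward declarations of the AVX deinterleave overloads; the AVX prefetch
// hints simply forward to the SSE implementations.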
  21880. template <typename A>
  21881. inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const float *, A);
  21882. template <typename A>
  21883. inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const short *, A);
  21884. template <typename A>
  21885. inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const ushort *, A);
  21886. template <typename A>
  21887. inline void deinterleave(AVX2::double_v &, AVX2::double_v &, const double *, A);
  21888. template <typename A>
  21889. inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const int *, A);
  21890. template <typename A>
  21891. inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const short *, A);
  21892. template <typename A>
  21893. inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const uint *, A);
  21894. template <typename A>
  21895. inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const ushort *, A);
  21896. template <typename A>
  21897. inline void deinterleave(AVX2::short_v &, AVX2::short_v &, const short *, A);
  21898. template <typename A>
  21899. inline void deinterleave(AVX2::ushort_v &, AVX2::ushort_v &, const ushort *, A);
  21900. template <typename T, typename M, typename A>
  21901. Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
  21902. AVX2::Vector<T> &Vc_RESTRICT b,
  21903. AVX2::Vector<T> &Vc_RESTRICT c,
  21904. const M *Vc_RESTRICT memory,
  21905. A align) Vc_ALWAYS_INLINE_R;
  21906. template <typename T, typename M, typename A>
  21907. Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
  21908. AVX2::Vector<T> &Vc_RESTRICT b,
  21909. AVX2::Vector<T> &Vc_RESTRICT c,
  21910. AVX2::Vector<T> &Vc_RESTRICT d,
  21911. const M *Vc_RESTRICT memory,
  21912. A align) Vc_ALWAYS_INLINE_R;
  21913. template <typename T, typename M, typename A>
  21914. Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
  21915. AVX2::Vector<T> &Vc_RESTRICT b,
  21916. AVX2::Vector<T> &Vc_RESTRICT c,
  21917. AVX2::Vector<T> &Vc_RESTRICT d,
  21918. AVX2::Vector<T> &Vc_RESTRICT e,
  21919. const M *Vc_RESTRICT memory,
  21920. A align) Vc_ALWAYS_INLINE_R;
  21921. template <typename T, typename M, typename A>
  21922. Vc_ALWAYS_INLINE_L void deinterleave(
  21923. AVX2::Vector<T> &Vc_RESTRICT a, AVX2::Vector<T> &Vc_RESTRICT b,
  21924. AVX2::Vector<T> &Vc_RESTRICT c, AVX2::Vector<T> &Vc_RESTRICT d,
  21925. AVX2::Vector<T> &Vc_RESTRICT e, AVX2::Vector<T> &Vc_RESTRICT f,
  21926. const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R;
  21927. template <typename T, typename M, typename A>
  21928. Vc_ALWAYS_INLINE_L void deinterleave(
  21929. AVX2::Vector<T> &Vc_RESTRICT a, AVX2::Vector<T> &Vc_RESTRICT b,
  21930. AVX2::Vector<T> &Vc_RESTRICT c, AVX2::Vector<T> &Vc_RESTRICT d,
  21931. AVX2::Vector<T> &Vc_RESTRICT e, AVX2::Vector<T> &Vc_RESTRICT f,
  21932. AVX2::Vector<T> &Vc_RESTRICT g, AVX2::Vector<T> &Vc_RESTRICT h,
  21933. const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R;
  21934. Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr, VectorAbi::Avx)
  21935. {
  21936. prefetchForOneRead(addr, VectorAbi::Sse());
  21937. }
  21938. Vc_ALWAYS_INLINE void prefetchForModify(const void *addr, VectorAbi::Avx)
  21939. {
  21940. prefetchForModify(addr, VectorAbi::Sse());
  21941. }
  21942. Vc_ALWAYS_INLINE void prefetchClose(const void *addr, VectorAbi::Avx)
  21943. {
  21944. prefetchClose(addr, VectorAbi::Sse());
  21945. }
  21946. Vc_ALWAYS_INLINE void prefetchMid(const void *addr, VectorAbi::Avx)
  21947. {
  21948. prefetchMid(addr, VectorAbi::Sse());
  21949. }
  21950. Vc_ALWAYS_INLINE void prefetchFar(const void *addr, VectorAbi::Avx)
  21951. {
  21952. prefetchFar(addr, VectorAbi::Sse());
  21953. }
  21954. }
  21955. }
  21956. namespace Vc_VERSIONED_NAMESPACE
  21957. {
  21958. namespace AVX2
  21959. {
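// In-register AVX2 deinterleave: the three-vector overloads separate
// structure-of-three interleaved data (x0 y0 z0 x1 ...) already loaded into
// three registers; the two-vector overloads mirror the SSE versions using
// 128-bit lane shuffles followed by unpack operations.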
  21960. inline void deinterleave(double_v &Vc_RESTRICT a, double_v &Vc_RESTRICT b, double_v &Vc_RESTRICT c)
  21961. {
  21962. const m256d tmp0 = Mem::shuffle128<X0, Y1>(a.data(), b.data());
  21963. const m256d tmp1 = Mem::shuffle128<X1, Y0>(a.data(), c.data());
  21964. const m256d tmp2 = Mem::shuffle128<X0, Y1>(b.data(), c.data());
  21965. a.data() = Mem::shuffle<X0, Y1, X2, Y3>(tmp0, tmp1);
  21966. b.data() = Mem::shuffle<X1, Y0, X3, Y2>(tmp0, tmp2);
  21967. c.data() = Mem::shuffle<X0, Y1, X2, Y3>(tmp1, tmp2);
  21968. }
  21969. inline void deinterleave(float_v &Vc_RESTRICT a, float_v &Vc_RESTRICT b, float_v &Vc_RESTRICT c)
  21970. {
  21971. const m256 ac0 = Mem::shuffle128<X0, Y0>(a.data(), c.data());
  21972. const m256 ac1 = Mem::shuffle128<X1, Y1>(a.data(), c.data());
  21973. m256 tmp0 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>( ac0, b.data());
  21974. tmp0 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(tmp0, ac1);
  21975. m256 tmp1 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>( ac0, b.data());
  21976. tmp1 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(tmp1, ac1);
  21977. m256 tmp2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>( ac0, b.data());
  21978. tmp2 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>(tmp2, ac1);
  21979. a.data() = Mem::permute<X0, X3, X2, X1>(tmp0);
  21980. b.data() = Mem::permute<X1, X0, X3, X2>(tmp1);
  21981. c.data() = Mem::permute<X2, X1, X0, X3>(tmp2);
  21982. }
  21983. inline void deinterleave(int_v &Vc_RESTRICT a, int_v &Vc_RESTRICT b, int_v &Vc_RESTRICT c)
  21984. {
  21985. deinterleave(reinterpret_cast<float_v &>(a), reinterpret_cast<float_v &>(b),
  21986. reinterpret_cast<float_v &>(c));
  21987. }
  21988. inline void deinterleave(uint_v &Vc_RESTRICT a, uint_v &Vc_RESTRICT b, uint_v &Vc_RESTRICT c)
  21989. {
  21990. deinterleave(reinterpret_cast<float_v &>(a), reinterpret_cast<float_v &>(b),
  21991. reinterpret_cast<float_v &>(c));
  21992. }
  21993. inline void deinterleave(Vector<short> &Vc_RESTRICT , Vector<short> &Vc_RESTRICT ,
  21994. Vector<short> &Vc_RESTRICT )
  21995. {
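// Note: the three-vector deinterleave for short is left unimplemented here;
// the call is a no-op.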
  21996. return;
  21997. }
  21998. inline void deinterleave(Vector<unsigned short> &Vc_RESTRICT a, Vector<unsigned short> &Vc_RESTRICT b,
  21999. Vector<unsigned short> &Vc_RESTRICT c)
  22000. {
  22001. deinterleave(reinterpret_cast<Vector<short> &>(a), reinterpret_cast<Vector<short> &>(b),
  22002. reinterpret_cast<Vector<short> &>(c));
  22003. }
  22004. inline void deinterleave(Vector<float> &a, Vector<float> &b)
  22005. {
  22006. const m256 tmp0 = Reg::permute128<Y0, X0>(a.data(), b.data());
  22007. const m256 tmp1 = Reg::permute128<Y1, X1>(a.data(), b.data());
  22008. const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
  22009. const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
  22010. a.data() = _mm256_unpacklo_ps(tmp2, tmp3);
  22011. b.data() = _mm256_unpackhi_ps(tmp2, tmp3);
  22012. }
  22013. inline void deinterleave(Vector<short> &a,
  22014. Vector<short> &b)
  22015. {
  22016. auto v0 = Mem::shuffle128<X0, Y0>(a.data(), b.data());
  22017. auto v1 = Mem::shuffle128<X1, Y1>(a.data(), b.data());
  22018. auto v2 = AVX::unpacklo_epi16(v0, v1);
  22019. auto v3 = AVX::unpackhi_epi16(v0, v1);
  22020. v0 = AVX::unpacklo_epi16(v2, v3);
  22021. v1 = AVX::unpackhi_epi16(v2, v3);
  22022. a.data() = AVX::unpacklo_epi16(v0, v1);
  22023. b.data() = AVX::unpackhi_epi16(v0, v1);
  22024. }
  22025. inline void deinterleave(Vector<ushort> &a, Vector<ushort> &b)
  22026. {
  22027. auto v0 = Mem::shuffle128<X0, Y0>(a.data(), b.data());
  22028. auto v1 = Mem::shuffle128<X1, Y1>(a.data(), b.data());
  22029. auto v2 = AVX::unpacklo_epi16(v0, v1);
  22030. auto v3 = AVX::unpackhi_epi16(v0, v1);
  22031. v0 = AVX::unpacklo_epi16(v2, v3);
  22032. v1 = AVX::unpackhi_epi16(v2, v3);
  22033. a.data() = AVX::unpacklo_epi16(v0, v1);
  22034. b.data() = AVX::unpackhi_epi16(v0, v1);
  22035. }
  22036. }
  22037. namespace Detail
  22038. {
  22039. template <typename Flags>
  22040. inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const float *m, Flags align)
  22041. {
  22042. a.load(m, align);
  22043. b.load(m + AVX2::float_v::Size, align);
  22044. Vc::AVX2::deinterleave(a, b);
  22045. }
  22046. template <typename Flags>
  22047. inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const short *m, Flags f)
  22048. {
  22049. using namespace Vc::AVX2;
  22050. const auto tmp = Detail::load32(m, f);
  22051. a.data() =
  22052. _mm256_cvtepi32_ps(concat(_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
  22053. _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16)));
  22054. b.data() = _mm256_cvtepi32_ps(
  22055. concat(_mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16)));
  22056. }
  22057. template <typename Flags>
  22058. inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const unsigned short *m, Flags f)
  22059. {
  22060. using namespace Vc::AVX2;
  22061. const auto tmp = Detail::load32(m, f);
  22062. a.data() = _mm256_cvtepi32_ps(
  22063. concat(_mm_blend_epi16(lo128(tmp), _mm_setzero_si128(), 0xaa),
  22064. _mm_blend_epi16(hi128(tmp), _mm_setzero_si128(), 0xaa)));
  22065. b.data() = _mm256_cvtepi32_ps(
  22066. concat(_mm_srli_epi32(lo128(tmp), 16), _mm_srli_epi32(hi128(tmp), 16)));
  22067. }
  22068. template <typename Flags>
  22069. inline void deinterleave(AVX2::double_v &a, AVX2::double_v &b, const double *m, Flags align)
  22070. {
  22071. using namespace Vc::AVX2;
  22072. a.load(m, align);
  22073. b.load(m + AVX2::double_v::Size, align);
  22074. m256d tmp0 = Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data());
  22075. m256d tmp1 = Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data());
  22076. a.data() = _mm256_unpacklo_pd(tmp0, tmp1);
  22077. b.data() = _mm256_unpackhi_pd(tmp0, tmp1);
  22078. }
  22079. template <typename Flags>
  22080. inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const int *m, Flags align)
  22081. {
  22082. using namespace AVX;
  22083. a.load(m, align);
  22084. b.load(m + AVX2::int_v::Size, align);
  22085. const m256 tmp0 = avx_cast<m256>(Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()));
  22086. const m256 tmp1 = avx_cast<m256>(Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()));
  22087. const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
  22088. const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
  22089. a.data() = avx_cast<m256i>(_mm256_unpacklo_ps(tmp2, tmp3));
  22090. b.data() = avx_cast<m256i>(_mm256_unpackhi_ps(tmp2, tmp3));
  22091. }
  22092. template <typename Flags>
  22093. inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const short *m, Flags f)
  22094. {
  22095. using namespace Vc::AVX;
  22096. const AVX2::short_v tmp0(m, f);
  22097. const m256i tmp = tmp0.data();
  22098. a.data() = concat(
  22099. _mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
  22100. _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16));
  22101. b.data() = concat(
  22102. _mm_srai_epi32(lo128(tmp), 16),
  22103. _mm_srai_epi32(hi128(tmp), 16));
  22104. }
  22105. template <typename Flags>
  22106. inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned int *m, Flags align)
  22107. {
  22108. using namespace AVX;
  22109. a.load(m, align);
  22110. b.load(m + AVX2::uint_v::Size, align);
  22111. const m256 tmp0 = avx_cast<m256>(Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()));
  22112. const m256 tmp1 = avx_cast<m256>(Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()));
  22113. const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
  22114. const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
  22115. a.data() = avx_cast<m256i>(_mm256_unpacklo_ps(tmp2, tmp3));
  22116. b.data() = avx_cast<m256i>(_mm256_unpackhi_ps(tmp2, tmp3));
  22117. }
  22118. template <typename Flags>
  22119. inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned short *m, Flags f)
  22120. {
  22121. using namespace Vc::AVX;
  22122. const AVX2::ushort_v tmp0(m, f);
  22123. const m256i tmp = tmp0.data();
  22124. a.data() = concat(
  22125. _mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
  22126. _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16));
  22127. b.data() = concat(
  22128. _mm_srai_epi32(lo128(tmp), 16),
  22129. _mm_srai_epi32(hi128(tmp), 16));
  22130. }
  22131. template <typename Flags>
  22132. inline void deinterleave(AVX2::short_v &a, AVX2::short_v &b, const short *m, Flags align)
  22133. {
  22134. a.load(m, align);
  22135. b.load(m + AVX2::short_v::Size, align);
  22136. Vc::AVX2::deinterleave(a, b);
  22137. }
  22138. template <typename Flags>
  22139. inline void deinterleave(AVX2::ushort_v &a, AVX2::ushort_v &b, const unsigned short *m, Flags align)
  22140. {
  22141. a.load(m, align);
  22142. b.load(m + AVX2::ushort_v::Size, align);
  22143. Vc::AVX2::deinterleave(a, b);
  22144. }
  22145. template <typename T, typename M, typename Flags>
  22146. Vc_ALWAYS_INLINE void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
  22147. AVX2::Vector<T> &Vc_RESTRICT b,
  22148. AVX2::Vector<T> &Vc_RESTRICT c,
  22149. const M *Vc_RESTRICT memory, Flags align)
  22150. {
  22151. using V = AVX2::Vector<T>;
  22152. a.load(&memory[0 * V::Size], align);
  22153. b.load(&memory[1 * V::Size], align);
  22154. c.load(&memory[2 * V::Size], align);
  22155. Vc::AVX2::deinterleave(a, b, c);
  22156. }
  22157. }
  22158. }
  22159. #endif
  22160. #ifndef VC_AVX_MATH_H_
  22161. #define VC_AVX_MATH_H_
  22162. namespace Vc_VERSIONED_NAMESPACE
  22163. {
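// AVX/AVX2 math: min/max map onto the corresponding _mm256_min/_mm256_max
// intrinsics (the integer overloads require AVX2); abs masks off the sign bit
// (or uses _mm256_abs_epi*); isfinite uses an ordered compare against 0 * x,
// and isinf compares |x| bitwise against the infinity bit pattern.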
  22164. #ifdef Vc_IMPL_AVX2
  22165. Vc_ALWAYS_INLINE AVX2::int_v min(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_min_epi32(x.data(), y.data()); }
  22166. Vc_ALWAYS_INLINE AVX2::uint_v min(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_min_epu32(x.data(), y.data()); }
  22167. Vc_ALWAYS_INLINE AVX2::short_v min(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_min_epi16(x.data(), y.data()); }
  22168. Vc_ALWAYS_INLINE AVX2::ushort_v min(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_min_epu16(x.data(), y.data()); }
  22169. Vc_ALWAYS_INLINE AVX2::int_v max(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_max_epi32(x.data(), y.data()); }
  22170. Vc_ALWAYS_INLINE AVX2::uint_v max(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_max_epu32(x.data(), y.data()); }
  22171. Vc_ALWAYS_INLINE AVX2::short_v max(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_max_epi16(x.data(), y.data()); }
  22172. Vc_ALWAYS_INLINE AVX2::ushort_v max(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_max_epu16(x.data(), y.data()); }
  22173. #endif
  22174. Vc_ALWAYS_INLINE AVX2::float_v min(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_min_ps(x.data(), y.data()); }
  22175. Vc_ALWAYS_INLINE AVX2::double_v min(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_min_pd(x.data(), y.data()); }
  22176. Vc_ALWAYS_INLINE AVX2::float_v max(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_max_ps(x.data(), y.data()); }
  22177. Vc_ALWAYS_INLINE AVX2::double_v max(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_max_pd(x.data(), y.data()); }
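// min/max map directly to the corresponding vmin*/vmax* instructions, i.e.
// one element-wise operation per call.  A hedged usage sketch (names are
// illustrative only):
//   AVX2::float_v clamped = Vc::min(Vc::max(x, lowerBound), upperBound);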
  22178. template <typename T>
  22179. Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> sqrt(const AVX2::Vector<T> &x)
  22180. {
  22181. return AVX::VectorHelper<T>::sqrt(x.data());
  22182. }
  22183. template <typename T>
  22184. Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> rsqrt(const AVX2::Vector<T> &x)
  22185. {
  22186. return AVX::VectorHelper<T>::rsqrt(x.data());
  22187. }
  22188. template <typename T>
  22189. Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> reciprocal(const AVX2::Vector<T> &x)
  22190. {
  22191. return AVX::VectorHelper<T>::reciprocal(x.data());
  22192. }
  22193. template <typename T>
  22194. Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> round(const AVX2::Vector<T> &x)
  22195. {
  22196. return AVX::VectorHelper<T>::round(x.data());
  22197. }
  22198. Vc_INTRINSIC Vc_CONST AVX2::double_v abs(AVX2::double_v x)
  22199. {
  22200. return Detail::and_(x.data(), AVX::setabsmask_pd());
  22201. }
  22202. Vc_INTRINSIC Vc_CONST AVX2::float_v abs(AVX2::float_v x)
  22203. {
  22204. return Detail::and_(x.data(), AVX::setabsmask_ps());
  22205. }
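// abs for float/double clears the sign bit by ANDing with a per-element mask
// of 0x7fffffff / 0x7fffffffffffffff (setabsmask_ps/pd), so no comparison or
// branching is needed; the AVX2 integer overloads below use the dedicated
// _mm256_abs_* instructions instead.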
  22206. #ifdef Vc_IMPL_AVX2
  22207. Vc_INTRINSIC Vc_CONST AVX2::int_v abs(AVX2::int_v x)
  22208. {
  22209. return _mm256_abs_epi32(x.data());
  22210. }
  22211. Vc_INTRINSIC Vc_CONST AVX2::short_v abs(AVX2::short_v x)
  22212. {
  22213. return _mm256_abs_epi16(x.data());
  22214. }
  22215. #endif
  22216. Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isfinite(const AVX2::double_v &x)
  22217. {
  22218. return AVX::cmpord_pd(x.data(), _mm256_mul_pd(Detail::zero<__m256d>(), x.data()));
  22219. }
  22220. Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isfinite(const AVX2::float_v &x)
  22221. {
  22222. return AVX::cmpord_ps(x.data(), _mm256_mul_ps(Detail::zero<__m256>(), x.data()));
  22223. }
  22224. Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isinf(const AVX2::double_v &x)
  22225. {
  22226. return _mm256_castsi256_pd(AVX::cmpeq_epi64(
  22227. _mm256_castpd_si256(abs(x).data()),
  22228. _mm256_castpd_si256(Detail::avx_broadcast(AVX::c_log<double>::d(1)))));
  22229. }
  22230. Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isinf(const AVX2::float_v &x)
  22231. {
  22232. return _mm256_castsi256_ps(
  22233. AVX::cmpeq_epi32(_mm256_castps_si256(abs(x).data()),
  22234. _mm256_castps_si256(Detail::avx_broadcast(AVX::c_log<float>::d(1)))));
  22235. }
  22236. Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isnan(const AVX2::double_v &x)
  22237. {
  22238. return AVX::cmpunord_pd(x.data(), x.data());
  22239. }
  22240. Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isnan(const AVX2::float_v &x)
  22241. {
  22242. return AVX::cmpunord_ps(x.data(), x.data());
  22243. }
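// Classification tricks used above:
//  - isfinite: x is compared "ordered" against 0 * x; for infinities and NaNs
//    that product is NaN, so the ordered compare yields false.
//  - isinf: the absolute-value bit pattern is compared against the bit
//    pattern of +infinity.
//  - isnan: a NaN compares unordered with itself.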
  22244. Vc_INTRINSIC Vc_CONST AVX2::float_v copysign(AVX2::float_v mag, AVX2::float_v sign)
  22245. {
  22246. return _mm256_or_ps(_mm256_and_ps(sign.data(), AVX::setsignmask_ps()),
  22247. _mm256_and_ps(mag.data(), AVX::setabsmask_ps()));
  22248. }
  22249. Vc_INTRINSIC Vc_CONST AVX2::double_v copysign(AVX2::double_v::AsArg mag,
  22250. AVX2::double_v::AsArg sign)
  22251. {
  22252. return _mm256_or_pd(_mm256_and_pd(sign.data(), AVX::setsignmask_pd()),
  22253. _mm256_and_pd(mag.data(), AVX::setabsmask_pd()));
  22254. }
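// copysign combines the sign bit of `sign` with the magnitude bits of `mag`:
// (sign & signmask) | (mag & absmask).  In scalar terms this matches
// std::copysign(mag, sign), e.g. copysign(3.0, -0.0) == -3.0.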
  22255. inline AVX2::double_v frexp(AVX2::double_v::AsArg v, SimdArray<int, 4> *e)
  22256. {
  22257. const __m256d exponentBits = AVX::Const<double>::exponentMask().dataD();
  22258. const __m256d exponentPart = _mm256_and_pd(v.data(), exponentBits);
  22259. auto lo = AVX::avx_cast<__m128i>(AVX::lo128(exponentPart));
  22260. auto hi = AVX::avx_cast<__m128i>(AVX::hi128(exponentPart));
  22261. lo = _mm_sub_epi32(_mm_srli_epi64(lo, 52), _mm_set1_epi64x(0x3fe));
  22262. hi = _mm_sub_epi32(_mm_srli_epi64(hi, 52), _mm_set1_epi64x(0x3fe));
  22263. SSE::int_v exponent = Mem::shuffle<X0, X2, Y0, Y2>(lo, hi);
  22264. const __m256d exponentMaximized = _mm256_or_pd(v.data(), exponentBits);
  22265. AVX2::double_v ret =
  22266. _mm256_and_pd(exponentMaximized,
  22267. _mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask)));
  22268. const double_m zeroMask = v == AVX2::double_v::Zero();
  22269. ret(isnan(v) || !isfinite(v) || zeroMask) = v;
  22270. exponent.setZero(simd_cast<SSE::int_m>(zeroMask));
  22271. internal_data(*e) = exponent;
  22272. return ret;
  22273. }
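// frexp(double_v) above works directly on the IEEE-754 representation:
//  - the biased exponent field is shifted down by 52 and rebiased with 0x3fe,
//    so the returned exponent e satisfies v == ret * 2^e with |ret| in
//    [0.5, 1);
//  - the mantissa is forced into [0.5, 1) by setting all exponent bits and
//    then ANDing with frexpMask;
//  - zero inputs keep exponent 0, and NaN/infinity inputs are passed through.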
  22274. #ifdef Vc_IMPL_AVX2
  22275. inline SimdArray<double, 8> frexp(const SimdArray<double, 8> &v, SimdArray<int, 8> *e)
  22276. {
  22277. const __m256d exponentBits = AVX::Const<double>::exponentMask().dataD();
  22278. const __m256d w[2] = {internal_data(internal_data0(v)).data(),
  22279. internal_data(internal_data1(v)).data()};
  22280. const __m256i exponentPart[2] = {
  22281. _mm256_castpd_si256(_mm256_and_pd(w[0], exponentBits)),
  22282. _mm256_castpd_si256(_mm256_and_pd(w[1], exponentBits))};
  22283. const __m256i lo = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[0], 52),
  22284. _mm256_set1_epi32(0x3fe));
  22285. const __m256i hi = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[1], 52),
  22286. _mm256_set1_epi32(0x3fe));
  22287. const __m256i a = _mm256_unpacklo_epi32(lo, hi);
  22288. const __m256i b = _mm256_unpackhi_epi32(lo, hi);
  22289. const __m256i tmp = _mm256_unpacklo_epi32(a, b);
  22290. const __m256i exponent =
  22291. AVX::concat(_mm_unpacklo_epi64(AVX::lo128(tmp), AVX::hi128(tmp)),
  22292. _mm_unpackhi_epi64(AVX::lo128(tmp), AVX::hi128(tmp)));
  22293. const __m256d exponentMaximized[2] = {_mm256_or_pd(w[0], exponentBits),
  22294. _mm256_or_pd(w[1], exponentBits)};
  22295. const auto frexpMask =
  22296. _mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask));
  22297. fixed_size_simd<double, 8> ret = {
  22298. fixed_size_simd<double, 4>(
  22299. AVX::double_v(_mm256_and_pd(exponentMaximized[0], frexpMask))),
  22300. fixed_size_simd<double, 4>(
  22301. AVX::double_v(_mm256_and_pd(exponentMaximized[1], frexpMask)))};
  22302. const auto zeroMask = v == v.Zero();
  22303. ret(isnan(v) || !isfinite(v) || zeroMask) = v;
  22304. internal_data(*e) =
  22305. Detail::andnot_(simd_cast<AVX2::int_m>(zeroMask).dataI(), exponent);
  22306. return ret;
  22307. }
  22308. #endif
  22309. namespace Detail
  22310. {
  22311. Vc_INTRINSIC AVX2::float_v::IndexType extractExponent(__m256 e)
  22312. {
  22313. SimdArray<uint, float_v::Size> exponentPart;
  22314. const auto ee = AVX::avx_cast<__m256i>(e);
  22315. #ifdef Vc_IMPL_AVX2
  22316. exponentPart = AVX2::uint_v(ee);
  22317. #else
  22318. internal_data(internal_data0(exponentPart)) = AVX::lo128(ee);
  22319. internal_data(internal_data1(exponentPart)) = AVX::hi128(ee);
  22320. #endif
  22321. return (exponentPart >> 23) - 0x7e;
  22322. }
  22323. }
  22324. inline AVX2::float_v frexp(AVX2::float_v::AsArg v, SimdArray<int, 8> *e)
  22325. {
  22326. using namespace Detail;
  22327. using namespace AVX2;
  22328. const __m256 exponentBits = Const<float>::exponentMask().data();
  22329. *e = extractExponent(and_(v.data(), exponentBits));
  22330. const __m256 exponentMaximized = or_(v.data(), exponentBits);
  22331. AVX2::float_v ret = _mm256_and_ps(exponentMaximized, avx_cast<__m256>(set1_epi32(0xbf7fffffu)));
  22332. ret(isnan(v) || !isfinite(v) || v == AVX2::float_v::Zero()) = v;
  22333. e->setZero(simd_cast<decltype(*e == *e)>(v == AVX2::float_v::Zero()));
  22334. return ret;
  22335. }
  22336. inline AVX2::double_v ldexp(AVX2::double_v::AsArg v, const SimdArray<int, 4> &_e)
  22337. {
  22338. SSE::int_v e = internal_data(_e);
  22339. e.setZero(simd_cast<SSE::int_m>(v == AVX2::double_v::Zero()));
  22340. const __m256i exponentBits =
  22341. AVX::concat(_mm_slli_epi64(_mm_unpacklo_epi32(e.data(), e.data()), 52),
  22342. _mm_slli_epi64(_mm_unpackhi_epi32(e.data(), e.data()), 52));
  22343. return AVX::avx_cast<__m256d>(
  22344. AVX::add_epi64(AVX::avx_cast<__m256i>(v.data()), exponentBits));
  22345. }
  22346. inline AVX2::float_v ldexp(AVX2::float_v::AsArg v, SimdArray<int, 8> e)
  22347. {
  22348. e.setZero(simd_cast<decltype(e == e)>(v == AVX2::float_v::Zero()));
  22349. e <<= 23;
  22350. #ifdef Vc_IMPL_AVX2
  22351. return {AVX::avx_cast<__m256>(
  22352. AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())),
  22353. AVX::lo128(internal_data(e).data())),
  22354. _mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())),
  22355. AVX::hi128(internal_data(e).data()))))};
  22356. #else
  22357. return {AVX::avx_cast<__m256>(
  22358. AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())),
  22359. internal_data(internal_data0(e)).data()),
  22360. _mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())),
  22361. internal_data(internal_data1(e)).data())))};
  22362. #endif
  22363. }
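// ldexp(float_v) exploits the IEEE-754 layout: adding (e << 23) to the raw
// bits increments the exponent field by e, i.e. multiplies by 2^e, as long as
// the result stays in the normal range.  Exponents belonging to zero inputs
// are masked to zero first so that 0 * 2^e stays 0.  A hedged usage sketch:
//   AVX2::float_v scaled = Vc::ldexp(v, exponents);  // v * 2^exponents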
  22364. Vc_ALWAYS_INLINE AVX2::float_v trunc(AVX2::float_v::AsArg v)
  22365. {
  22366. return _mm256_round_ps(v.data(), 0x3);
  22367. }
  22368. Vc_ALWAYS_INLINE AVX2::double_v trunc(AVX2::double_v::AsArg v)
  22369. {
  22370. return _mm256_round_pd(v.data(), 0x3);
  22371. }
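// The immediate 0x3 passed to _mm256_round_ps/_mm256_round_pd above is
// _MM_FROUND_TO_ZERO, i.e. truncation toward zero.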
  22372. Vc_ALWAYS_INLINE AVX2::float_v floor(AVX2::float_v::AsArg v)
  22373. {
  22374. return _mm256_floor_ps(v.data());
  22375. }
  22376. Vc_ALWAYS_INLINE AVX2::double_v floor(AVX2::double_v::AsArg v)
  22377. {
  22378. return _mm256_floor_pd(v.data());
  22379. }
  22380. Vc_ALWAYS_INLINE AVX2::float_v ceil(AVX2::float_v::AsArg v)
  22381. {
  22382. return _mm256_ceil_ps(v.data());
  22383. }
  22384. Vc_ALWAYS_INLINE AVX2::double_v ceil(AVX2::double_v::AsArg v)
  22385. {
  22386. return _mm256_ceil_pd(v.data());
  22387. }
  22388. template <typename T>
  22389. Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx> fma(Vector<T, VectorAbi::Avx> a,
  22390. Vector<T, VectorAbi::Avx> b,
  22391. Vector<T, VectorAbi::Avx> c)
  22392. {
  22393. return Detail::fma(a.data(), b.data(), c.data(), T());
  22394. }
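// fma(a, b, c) computes a * b + c; Detail::fma dispatches on the element type
// and is expected to use a fused multiply-add instruction where the target
// provides one.  A hedged usage sketch (names are illustrative only):
//   AVX2::double_v y = Vc::fma(slope, x, offset);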
  22395. }
  22396. #endif
  22397. #ifndef Vc_AVX_SIMD_CAST_CALLER_TCC_
  22398. #define Vc_AVX_SIMD_CAST_CALLER_TCC_
  22399. namespace Vc_VERSIONED_NAMESPACE
  22400. {
  22401. #if Vc_IS_VERSION_1
  22402. template <typename T>
  22403. template <typename U, typename>
  22404. Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(U &&x)
  22405. : d(simd_cast<Vector>(std::forward<U>(x)).data())
  22406. {
  22407. }
  22408. template <typename T>
  22409. template <typename U>
  22410. Vc_INTRINSIC Mask<T, VectorAbi::Avx>::Mask(U &&rhs,
  22411. Common::enable_if_mask_converts_explicitly<T, U>)
  22412. : Mask(simd_cast<Mask>(std::forward<U>(rhs)))
  22413. {
  22414. }
  22415. #endif
  22416. }
  22417. #endif
  22418. #endif
  22419. #ifndef VC_COMMON_MATH_H_
  22420. #define VC_COMMON_MATH_H_
  22421. #define Vc_COMMON_MATH_H_INTERNAL 1
  22422. #ifndef VC_COMMON_TRIGONOMETRIC_H_
  22423. #define VC_COMMON_TRIGONOMETRIC_H_
  22424. #ifdef Vc_HAVE_LIBMVEC
  22425. extern "C" {
  22426. __m128 _ZGVbN4v_sinf(__m128);
  22427. __m128d _ZGVbN2v_sin(__m128d);
  22428. __m128 _ZGVbN4v_cosf(__m128);
  22429. __m128d _ZGVbN2v_cos(__m128d);
  22430. __m256 _ZGVdN8v_sinf(__m256);
  22431. __m256d _ZGVdN4v_sin(__m256d);
  22432. __m256 _ZGVdN8v_cosf(__m256);
  22433. __m256d _ZGVdN4v_cos(__m256d);
  22434. }
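// These declarations name glibc's libmvec entry points.  The mangled names
// encode the vector-function ABI: '_ZGVbN4v_sinf' is the SSE (xmm, 4-lane)
// variant of sinf taking one vector argument, while '_ZGVdN8v_sinf' and
// '_ZGVdN4v_sin' are the AVX2 (ymm) 8-lane float and 4-lane double variants.
// They are only referenced when Vc_HAVE_LIBMVEC is defined.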
  22435. #endif
  22436. namespace Vc_VERSIONED_NAMESPACE
  22437. {
  22438. namespace Detail
  22439. {
  22440. template<Vc::Implementation Impl> struct MapImpl { enum Dummy { Value = Impl }; };
  22441. template<> struct MapImpl<Vc::SSE42Impl> { enum Dummy { Value = MapImpl<Vc::SSE41Impl>::Value }; };
  22442. template<Vc::Implementation Impl> using TrigonometricImplementation =
  22443. ImplementationT<MapImpl<Impl>::Value
  22444. #if defined(Vc_IMPL_XOP) && defined(Vc_IMPL_FMA4)
  22445. + Vc::XopInstructions
  22446. + Vc::Fma4Instructions
  22447. #endif
  22448. >;
  22449. }
  22450. namespace Common
  22451. {
  22452. template<typename Impl> struct Trigonometric
  22453. {
  22454. template<typename T> static T sin(const T &_x);
  22455. template<typename T> static T cos(const T &_x);
  22456. template<typename T> static void sincos(const T &_x, T *_sin, T *_cos);
  22457. template<typename T> static T asin (const T &_x);
  22458. template<typename T> static T atan (const T &_x);
  22459. template<typename T> static T atan2(const T &y, const T &x);
  22460. };
  22461. }
  22462. #if defined Vc_IMPL_SSE || defined DOXYGEN
  22463. namespace Detail
  22464. {
  22465. template <typename T, typename Abi>
  22466. using Trig = Common::Trigonometric<Detail::TrigonometricImplementation<
  22467. (std::is_same<Abi, VectorAbi::Sse>::value
  22468. ? SSE42Impl
  22469. : std::is_same<Abi, VectorAbi::Avx>::value ? AVXImpl : ScalarImpl)>>;
  22470. }
  22471. #ifdef Vc_HAVE_LIBMVEC
  22472. Vc_INTRINSIC __m128 sin_dispatch(__m128 x) { return ::_ZGVbN4v_sinf(x); }
  22473. Vc_INTRINSIC __m128d sin_dispatch(__m128d x) { return ::_ZGVbN2v_sin (x); }
  22474. Vc_INTRINSIC __m128 cos_dispatch(__m128 x) { return ::_ZGVbN4v_cosf(x); }
  22475. Vc_INTRINSIC __m128d cos_dispatch(__m128d x) { return ::_ZGVbN2v_cos (x); }
  22476. #ifdef Vc_IMPL_AVX
  22477. Vc_INTRINSIC __m256 sin_dispatch(__m256 x) { return ::_ZGVdN8v_sinf(x); }
  22478. Vc_INTRINSIC __m256d sin_dispatch(__m256d x) { return ::_ZGVdN4v_sin (x); }
  22479. Vc_INTRINSIC __m256 cos_dispatch(__m256 x) { return ::_ZGVdN8v_cosf(x); }
  22480. Vc_INTRINSIC __m256d cos_dispatch(__m256d x) { return ::_ZGVdN4v_cos (x); }
  22481. #endif
  22482. template <typename T, typename Abi>
  22483. Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> sin(const Vector<T, Abi> &x)
  22484. {
  22485. return sin_dispatch(x.data());
  22486. }
  22487. template <typename T, typename Abi>
  22488. Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> cos(const Vector<T, Abi> &x)
  22489. {
  22490. return cos_dispatch(x.data());
  22491. }
  22492. #else
  22493. template <typename T, typename Abi>
  22494. Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> sin(const Vector<T, Abi> &x)
  22495. {
  22496. return Detail::Trig<T, Abi>::sin(x);
  22497. }
  22498. template <typename T, typename Abi>
  22499. Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> cos(const Vector<T, Abi> &x)
  22500. {
  22501. return Detail::Trig<T, Abi>::cos(x);
  22502. }
  22503. #endif
  22504. template <typename T, typename Abi>
  22505. Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> asin(const Vector<T, Abi> &x)
  22506. {
  22507. return Detail::Trig<T, Abi>::asin(x);
  22508. }
  22509. template <typename T, typename Abi>
  22510. Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> atan(const Vector<T, Abi> &x)
  22511. {
  22512. return Detail::Trig<T, Abi>::atan(x);
  22513. }
  22514. template <typename T, typename Abi>
  22515. Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> atan2(const Vector<T, Abi> &y,
  22516. const Vector<T, Abi> &x)
  22517. {
  22518. return Detail::Trig<T, Abi>::atan2(y, x);
  22519. }
  22520. template <typename T, typename Abi>
  22521. Vc_INTRINSIC void sincos(const Vector<T, Abi> &x,
  22522. Vector<T, detail::not_fixed_size_abi<Abi>> *sin,
  22523. Vector<T, Abi> *cos)
  22524. {
  22525. Detail::Trig<T, Abi>::sincos(x, sin, cos);
  22526. }
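// A hedged usage sketch for the trigonometric wrappers (variable names are
// illustrative only):
//   AVX2::float_v s, c;
//   Vc::sincos(angles, &s, &c);       // sine and cosine in one pass
//   AVX2::float_v a = Vc::atan2(y, x);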
  22527. #endif
  22528. }
  22529. #endif
  22530. #ifndef VC_COMMON_CONST_H_
  22531. #define VC_COMMON_CONST_H_
  22532. #include <type_traits>
  22533. namespace Vc_VERSIONED_NAMESPACE
  22534. {
  22535. namespace Detail
  22536. {
  22537. template <int exponent> constexpr double exponentToFloat(std::integral_constant<bool, true>);
  22538. template <int exponent> constexpr double exponentToFloat(std::integral_constant<bool, false>);
  22539. template <> constexpr double exponentToFloat<0>(std::integral_constant<bool, true>)
  22540. {
  22541. return 1.;
  22542. }
  22543. template <> constexpr double exponentToFloat<0>(std::integral_constant<bool, false>)
  22544. {
  22545. return 1.;
  22546. }
  22547. template <> constexpr double exponentToFloat<-32>(std::integral_constant<bool, true>)
  22548. {
  22549. return 1. / (65536. * 65536.);
  22550. }
  22551. template <> constexpr double exponentToFloat<32>(std::integral_constant<bool, false>)
  22552. {
  22553. return 65536. * 65536.;
  22554. }
  22555. template <> constexpr double exponentToFloat<-64>(std::integral_constant<bool, true>)
  22556. {
  22557. return 1. / (65536. * 65536. * 65536. * 65536.);
  22558. }
  22559. template <> constexpr double exponentToFloat<64>(std::integral_constant<bool, false>)
  22560. {
  22561. return 65536. * 65536. * 65536. * 65536.;
  22562. }
  22563. template <int exponent>
  22564. constexpr double exponentToFloat(std::integral_constant<bool, false> negative)
  22565. {
  22566. return exponentToFloat<exponent - 1>(negative) * 2.0;
  22567. }
  22568. template <int exponent>
  22569. constexpr double exponentToFloat(std::integral_constant<bool, true> negative)
  22570. {
  22571. return exponentToFloat<exponent + 1>(negative) * 0.5;
  22572. }
  22573. template <int sign, unsigned long long mantissa, int exponent> constexpr double doubleConstant()
  22574. {
  22575. return (static_cast<double>((mantissa & 0x000fffffffffffffull) | 0x0010000000000000ull) /
  22576. 0x0010000000000000ull) *
  22577. exponentToFloat<exponent>(std::integral_constant<bool, (exponent < 0)>()) * sign;
  22578. }
  22579. template <int sign, unsigned int mantissa, int exponent> constexpr float floatConstant()
  22580. {
  22581. return (static_cast<float>((mantissa & 0x007fffffu) | 0x00800000u) / 0x00800000u) *
  22582. static_cast<float>(
  22583. exponentToFloat<exponent>(std::integral_constant<bool, (exponent < 0)>())) *
  22584. sign;
  22585. }
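// doubleConstant<sign, mantissa, exponent>() assembles
//   sign * (1 + mantissa / 2^52) * 2^exponent
// entirely at compile time; floatConstant does the same with a 23-bit
// mantissa.  For example, doubleConstant<1, 0, 54>() == 2^54 (used below to
// rescale denormal inputs) and doubleConstant<-1, 0, 0>() == -1.0.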
  22586. }
  22587. }
  22588. #endif
  22589. namespace Vc_VERSIONED_NAMESPACE
  22590. {
  22591. template <class T, class Abi>
  22592. SimdArray<int, Vector<T, Abi>::size()> fpclassify(const Vector<T, Abi> &x)
  22593. {
  22594. return SimdArray<int, Vector<T, Abi>::size()>(
  22595. [&](std::size_t i) { return std::fpclassify(x[i]); });
  22596. }
  22597. template <class T, size_t N> SimdArray<int, N> fpclassify(const SimdArray<T, N> &x)
  22598. {
  22599. return SimdArray<int, N>([&](std::size_t i) { return std::fpclassify(x[i]); });
  22600. }
  22601. #ifdef Vc_IMPL_SSE
  22602. #ifdef Vc_COMMON_MATH_H_INTERNAL
  22603. enum LogarithmBase {
  22604. BaseE, Base10, Base2
  22605. };
  22606. namespace Detail
  22607. {
  22608. template <typename T, typename Abi>
  22609. using Const = typename std::conditional<std::is_same<Abi, VectorAbi::Avx>::value,
  22610. AVX::Const<T>, SSE::Const<T>>::type;
  22611. template<LogarithmBase Base>
  22612. struct LogImpl
  22613. {
  22614. template<typename T, typename Abi> static Vc_ALWAYS_INLINE void log_series(Vector<T, Abi> &Vc_RESTRICT x, typename Vector<T, Abi>::AsArg exponent) {
  22615. typedef Vector<T, Abi> V;
  22616. typedef Detail::Const<T, Abi> C;
  22617. const V x2 = x * x;
  22618. #ifdef Vc_LOG_ILP
  22619. V y2 = (C::P(6) * x2 + C::P(7) * x) + C::P(8);
  22620. V y0 = (C::P(0) * x2 + C::P(1) * x) + C::P(2);
  22621. V y1 = (C::P(3) * x2 + C::P(4) * x) + C::P(5);
  22622. const V x3 = x2 * x;
  22623. const V x6 = x3 * x3;
  22624. const V x9 = x6 * x3;
  22625. V y = (y0 * x9 + y1 * x6) + y2 * x3;
  22626. #elif defined Vc_LOG_ILP2
  22627. const V x3 = x2 * x;
  22628. const V x4 = x2 * x2;
  22629. const V x5 = x2 * x3;
  22630. const V x6 = x3 * x3;
  22631. const V x7 = x4 * x3;
  22632. const V x8 = x4 * x4;
  22633. const V x9 = x5 * x4;
  22634. const V x10 = x5 * x5;
  22635. const V x11 = x5 * x6;
  22636. V y = C::P(0) * x11 + C::P(1) * x10 + C::P(2) * x9 + C::P(3) * x8 + C::P(4) * x7
  22637. + C::P(5) * x6 + C::P(6) * x5 + C::P(7) * x4 + C::P(8) * x3;
  22638. #else
  22639. V y = C::P(0);
  22640. Vc::Common::unrolled_loop<int, 1, 9>([&](int i) { y = y * x + C::P(i); });
  22641. y *= x * x2;
  22642. #endif
  22643. switch (Base) {
  22644. case BaseE:
  22645. y += exponent * C::ln2_small();
  22646. y -= x2 * C::_1_2();
  22647. x += y;
  22648. x += exponent * C::ln2_large();
  22649. break;
  22650. case Base10:
  22651. y += exponent * C::ln2_small();
  22652. y -= x2 * C::_1_2();
  22653. x += y;
  22654. x += exponent * C::ln2_large();
  22655. x *= C::log10_e();
  22656. break;
  22657. case Base2:
  22658. {
  22659. const V x_ = x;
  22660. x *= C::log2_e();
  22661. y *= C::log2_e();
  22662. y -= x_ * x * C::_1_2();
  22663. x += y;
  22664. x += exponent;
  22665. break;
  22666. }
  22667. }
  22668. }
  22669. template <typename Abi>
  22670. static Vc_ALWAYS_INLINE void log_series(Vector<double, Abi> &Vc_RESTRICT x,
  22671. typename Vector<double, Abi>::AsArg exponent)
  22672. {
  22673. typedef Vector<double, Abi> V;
  22674. typedef Detail::Const<double, Abi> C;
  22675. const V x2 = x * x;
  22676. V y = C::P(0);
  22677. V y2 = C::Q(0) + x;
  22678. Vc::Common::unrolled_loop<int, 1, 5>([&](int i) {
  22679. y = y * x + C::P(i);
  22680. y2 = y2 * x + C::Q(i);
  22681. });
  22682. y2 = x / y2;
  22683. y = y * x + C::P(5);
  22684. y = x2 * y * y2;
  22685. switch (Base) {
  22686. case BaseE:
  22687. y += exponent * C::ln2_small();
  22688. y -= x2 * C::_1_2();
  22689. x += y;
  22690. x += exponent * C::ln2_large();
  22691. break;
  22692. case Base10:
  22693. y += exponent * C::ln2_small();
  22694. y -= x2 * C::_1_2();
  22695. x += y;
  22696. x += exponent * C::ln2_large();
  22697. x *= C::log10_e();
  22698. break;
  22699. case Base2:
  22700. {
  22701. const V x_ = x;
  22702. x *= C::log2_e();
  22703. y *= C::log2_e();
  22704. y -= x_ * x * C::_1_2();
  22705. x += y;
  22706. x += exponent;
  22707. break;
  22708. }
  22709. }
  22710. }
  22711. template <typename T, typename Abi, typename V = Vector<T, Abi>>
  22712. static inline Vector<T, Abi> calc(V _x)
  22713. {
  22714. typedef typename V::Mask M;
  22715. typedef Detail::Const<T, Abi> C;
  22716. V x(_x);
  22717. const M invalidMask = x < V::Zero();
  22718. const M infinityMask = x == V::Zero();
  22719. const M denormal = x <= C::min();
  22720. x(denormal) *= V(Vc::Detail::doubleConstant<1, 0, 54>());
  22721. V exponent = Detail::exponent(x.data());
  22722. exponent(denormal) -= 54;
  22723. x.setZero(C::exponentMask());
  22724. x = Detail::operator|(x,
  22725. C::_1_2());
  22726. const M smallX = x < C::_1_sqrt2();
  22727. x(smallX) += x;
  22728. x -= V::One();
  22729. exponent(!smallX) += V::One();
  22730. log_series(x, exponent);
  22731. x.setQnan(invalidMask);
  22732. x(infinityMask) = C::neginf();
  22733. return x;
  22734. }
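// calc() implements a standard log kernel:
//  1. negative inputs are flagged (result becomes NaN), zeros are flagged
//     (result becomes -infinity), and denormals are pre-scaled by 2^54 with
//     the extracted exponent corrected by -54;
//  2. the value is split into an integer exponent and a mantissa adjusted
//     into [1/sqrt(2), sqrt(2)), then reduced by one (x -= V::One()) so the
//     series in log_series() converges quickly;
//  3. for base e the result is the series plus exponent * ln(2), with ln(2)
//     split into a large and a small part to limit rounding error; base 10
//     additionally multiplies by log10(e), and base 2 scales the series by
//     log2(e) and adds the exponent directly.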
  22735. };
  22736. }
  22737. template <typename T, typename Abi>
  22738. Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log(
  22739. const Vector<T, Abi> &x)
  22740. {
  22741. return Detail::LogImpl<BaseE>::calc<T, Abi>(x);
  22742. }
  22743. template <typename T, typename Abi>
  22744. Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log10(
  22745. const Vector<T, Abi> &x)
  22746. {
  22747. return Detail::LogImpl<Base10>::calc<T, Abi>(x);
  22748. }
  22749. template <typename T, typename Abi>
  22750. Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log2(
  22751. const Vector<T, Abi> &x)
  22752. {
  22753. return Detail::LogImpl<Base2>::calc<T, Abi>(x);
  22754. }
  22755. #endif
  22756. #ifdef Vc_COMMON_MATH_H_INTERNAL
  22757. constexpr float log2_e = 1.44269504088896341f;
  22758. constexpr float MAXLOGF = 88.72283905206835f;
  22759. constexpr float MINLOGF = -103.278929903431851103f;
  22760. constexpr float MAXNUMF = 3.4028234663852885981170418348451692544e38f;
  22761. template <typename Abi, typename = enable_if<std::is_same<Abi, VectorAbi::Sse>::value ||
  22762. std::is_same<Abi, VectorAbi::Avx>::value>>
  22763. inline Vector<float, detail::not_fixed_size_abi<Abi>> exp(Vector<float, Abi> x)
  22764. {
  22765. using V = Vector<float, Abi>;
  22766. typedef typename V::Mask M;
  22767. typedef Detail::Const<float, Abi> C;
  22768. const M overflow = x > MAXLOGF;
  22769. const M underflow = x < MINLOGF;
  22770. V z = floor(C::log2_e() * x + 0.5f);
  22771. const auto n = static_cast<Vc::SimdArray<int, V::Size>>(z);
  22772. x -= z * C::ln2_large();
  22773. x -= z * C::ln2_small();
  22774. z = ((((( 1.9875691500E-4f * x
  22775. + 1.3981999507E-3f) * x
  22776. + 8.3334519073E-3f) * x
  22777. + 4.1665795894E-2f) * x
  22778. + 1.6666665459E-1f) * x
  22779. + 5.0000001201E-1f) * (x * x)
  22780. + x
  22781. + 1.0f;
  22782. x = ldexp(z, n);
  22783. x(overflow) = std::numeric_limits<typename V::EntryType>::infinity();
  22784. x.setZero(underflow);
  22785. return x;
  22786. }
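// exp(float_v) follows the classic Cephes expf scheme: reduce the argument by
// z = round(x * log2(e)), subtract z * ln(2) in two pieces for accuracy,
// evaluate a short polynomial approximation of exp on the reduced argument,
// and rebuild the result with ldexp(poly, n).  Inputs above MAXLOGF overflow
// to +infinity, inputs below MINLOGF flush to zero.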
  22787. #endif
  22788. #ifdef Vc_IMPL_AVX
  22789. inline AVX::double_v exp(AVX::double_v _x)
  22790. {
  22791. AVX::Vector<double> x = _x;
  22792. typedef AVX::Vector<double> V;
  22793. typedef V::Mask M;
  22794. typedef AVX::Const<double> C;
  22795. const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>();
  22796. const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>();
  22797. V px = floor(C::log2_e() * x + 0.5);
  22798. __m128i tmp = _mm256_cvttpd_epi32(px.data());
  22799. const SimdArray<int, V::Size> n = SSE::int_v{tmp};
  22800. x -= px * C::ln2_large();
  22801. x -= px * C::ln2_small();
  22802. const double P[] = {
  22803. Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(),
  22804. Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(),
  22805. Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>()
  22806. };
  22807. const double Q[] = {
  22808. Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(),
  22809. Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(),
  22810. Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(),
  22811. Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>()
  22812. };
  22813. const V x2 = x * x;
  22814. px = x * ((P[0] * x2 + P[1]) * x2 + P[2]);
  22815. x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px);
  22816. x = V::One() + 2.0 * x;
  22817. x = ldexp(x, n);
  22818. x(overflow) = std::numeric_limits<double>::infinity();
  22819. x.setZero(underflow);
  22820. return x;
  22821. }
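// The double-precision exp above uses the Cephes exp() structure: after the
// same ln(2)-based range reduction it evaluates a rational approximation,
// px = x * P(x^2) and result = 1 + 2 * px / (Q(x^2) - px), then scales by 2^n
// via ldexp.  The overflow/underflow thresholds expressed with
// doubleConstant<> are approximately +-1022 * ln(2).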
  22822. #endif
  22823. inline SSE::double_v exp(SSE::double_v::AsArg _x) {
  22824. SSE::Vector<double> x = _x;
  22825. typedef SSE::Vector<double> V;
  22826. typedef V::Mask M;
  22827. typedef SSE::Const<double> C;
  22828. const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>();
  22829. const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>();
  22830. V px = floor(C::log2_e() * x + 0.5);
  22831. SimdArray<int, V::Size> n;
  22832. _mm_storel_epi64(reinterpret_cast<__m128i *>(&n), _mm_cvttpd_epi32(px.data()));
  22833. x -= px * C::ln2_large();
  22834. x -= px * C::ln2_small();
  22835. const double P[] = {
  22836. Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(),
  22837. Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(),
  22838. Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>()
  22839. };
  22840. const double Q[] = {
  22841. Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(),
  22842. Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(),
  22843. Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(),
  22844. Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>()
  22845. };
  22846. const V x2 = x * x;
  22847. px = x * ((P[0] * x2 + P[1]) * x2 + P[2]);
  22848. x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px);
  22849. x = V::One() + 2.0 * x;
  22850. x = ldexp(x, n);
  22851. x(overflow) = std::numeric_limits<double>::infinity();
  22852. x.setZero(underflow);
  22853. return x;
  22854. }
  22855. #endif
  22856. }
  22857. #undef Vc_COMMON_MATH_H_INTERNAL
  22858. #endif
  22859. #ifdef isfinite
  22860. #undef isfinite
  22861. #endif
  22862. #ifdef isnan
  22863. #undef isnan
  22864. #endif
  22865. #ifndef VC_COMMON_VECTORTUPLE_H_
  22866. #define VC_COMMON_VECTORTUPLE_H_
  22867. namespace Vc_VERSIONED_NAMESPACE
  22868. {
  22869. namespace Common
  22870. {
  22871. template<size_t StructSize, typename V, typename I, bool Readonly = true> struct InterleavedMemoryReadAccess;
  22872. template <int Length, typename V> class VectorReferenceArray
  22873. {
  22874. typedef typename V::EntryType T;
  22875. typedef V &Vc_RESTRICT Reference;
  22876. std::array<V * Vc_RESTRICT, Length> r;
  22877. typedef make_index_sequence<Length> IndexSequence;
  22878. template <typename VV, std::size_t... Indexes>
  22879. constexpr VectorReferenceArray<Length + 1, VV> appendOneReference(
  22880. VV &a, index_sequence<Indexes...>) const
  22881. {
  22882. return {*r[Indexes]..., a};
  22883. }
  22884. template <typename A, std::size_t... Indexes>
  22885. Vc_INTRINSIC void callDeinterleave(const A &access, index_sequence<Indexes...>) const
  22886. {
  22887. access.deinterleave(*r[Indexes]...);
  22888. }
  22889. public:
  22890. template <typename... Us, typename = enable_if<(sizeof...(Us) == Length)>>
  22891. constexpr VectorReferenceArray(Us &&... args)
  22892. : r{{std::addressof(std::forward<Us>(args))...}}
  22893. {
  22894. }
  22895. template <typename VV, typename = enable_if<!std::is_const<V>::value &&
  22896. std::is_same<VV, V>::value>>
  22897. Vc_DEPRECATED("build the tuple with Vc::tie instead") constexpr VectorReferenceArray<
  22898. Length + 1, V>
  22899. operator,(VV &a) const &&
  22900. {
  22901. return appendOneReference(a, IndexSequence());
  22902. }
  22903. Vc_DEPRECATED("build the tuple with Vc::tie instead") constexpr VectorReferenceArray<
  22904. Length + 1, const V>
  22905. operator,(const V &a) const &&
  22906. {
  22907. return appendOneReference(a, IndexSequence());
  22908. }
  22909. template <size_t StructSize, typename I, bool RO>
  22910. Vc_ALWAYS_INLINE enable_if<(Length <= StructSize), void> operator=(
  22911. const InterleavedMemoryReadAccess<StructSize, V, I, RO> &access) &&
  22912. {
  22913. callDeinterleave(access, IndexSequence());
  22914. }
  22915. template <size_t StructSize, typename I, bool RO>
  22916. enable_if<(Length > StructSize), void> operator=(
  22917. const InterleavedMemoryReadAccess<StructSize, V, I, RO> &access) && =
  22918. delete;
  22919. template <typename... Inputs> void operator=(TransposeProxy<Inputs...> &&proxy) &&
  22920. {
  22921. transpose_impl(TransposeTag<Length, sizeof...(Inputs)>(), &r[0], proxy);
  22922. }
  22923. template <typename T, typename IndexVector, typename Scale, bool Flag>
  22924. void operator=(SubscriptOperation<T, IndexVector, Scale, Flag> &&sub) &&
  22925. {
  22926. const auto &args = std::move(sub).gatherArguments();
  22927. Common::InterleavedMemoryReadAccess<1, V, Traits::decay<decltype(args.indexes)>>
  22928. deinterleaver(args.address, args.indexes);
  22929. callDeinterleave(deinterleaver, IndexSequence());
  22930. }
  22931. Vc_ALWAYS_INLINE Reference operator[](std::size_t i) { return *r[i]; }
  22932. };
  22933. }
  22934. template <typename T, typename Abi>
  22935. Vc_DEPRECATED("build the tuple with Vc::tie instead")
  22936. constexpr Common::VectorReferenceArray<2, Vc::Vector<T, Abi>>
  22937. operator,(Vc::Vector<T, Abi> &a, Vc::Vector<T, Abi> &b)
  22938. {
  22939. return {a, b};
  22940. }
  22941. template <typename T, typename Abi>
  22942. Vc_DEPRECATED("build the tuple with Vc::tie instead")
  22943. constexpr Common::VectorReferenceArray<2, const Vc::Vector<T, Abi>>
  22944. operator,(const Vc::Vector<T, Abi> &a, const Vc::Vector<T, Abi> &b)
  22945. {
  22946. return {a, b};
  22947. }
  22948. template <typename V, typename... Vs>
  22949. constexpr Common::VectorReferenceArray<sizeof...(Vs) + 1,
  22950. typename std::remove_reference<V>::type>
  22951. tie(V &&a, Vs &&... b)
  22952. {
  22953. return {std::forward<V>(a), std::forward<Vs>(b)...};
  22954. }
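// Vc::tie builds a VectorReferenceArray so that several vectors can be
// assigned from one deinterleaving source in a single statement.  A hedged
// usage sketch (the wrapper and index names are assumptions for
// illustration):
//   Vc::tie(x, y, z) = interleavedWrapper[indexes];
// which ends up invoking the deinterleave() member of the access object once
// for the whole set of referenced vectors.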
  22955. }
  22956. #endif
  22957. #ifndef VC_COMMON_IIF_H_
  22958. #define VC_COMMON_IIF_H_
  22959. #ifndef VC_TYPE_TRAITS_
  22960. #define VC_TYPE_TRAITS_
  22961. #include <type_traits>
  22962. namespace Vc_VERSIONED_NAMESPACE
  22963. {
  22964. using Traits::is_simd_mask;
  22965. using Traits::is_simd_vector;
  22966. using Traits::is_integral;
  22967. using Traits::is_floating_point;
  22968. using Traits::is_arithmetic;
  22969. using Traits::is_signed;
  22970. using Traits::is_unsigned;
  22971. template<typename T>
  22972. struct memory_alignment : public std::integral_constant<size_t, alignof(T)> {};
  22973. template<> struct memory_alignment<short_v> : public std::integral_constant<size_t, short_v::MemoryAlignment> {};
  22974. template<> struct memory_alignment<ushort_v> : public std::integral_constant<size_t, ushort_v::MemoryAlignment> {};
  22975. }
  22976. #endif
  22977. namespace Vc_VERSIONED_NAMESPACE
  22978. {
  22979. template <typename Mask, typename T>
  22980. Vc_ALWAYS_INLINE enable_if<is_simd_mask<Mask>::value && is_simd_vector<T>::value, T> iif(
  22981. const Mask &condition, const T &trueValue, const T &falseValue)
  22982. {
  22983. T result(falseValue);
  22984. Vc::where(condition) | result = trueValue;
  22985. return result;
  22986. }
  22987. template <typename Mask, typename T>
  22988. enable_if<is_simd_mask<Mask>::value && !is_simd_vector<T>::value, T> iif(
  22989. const Mask &, const T &, const T &) = delete;
  22990. template<typename T> constexpr T iif (bool condition, const T &trueValue, const T &falseValue)
  22991. {
  22992. return condition ? trueValue : falseValue;
  22993. }
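// iif(mask, a, b) is the vectorised counterpart of (cond ? a : b): it starts
// from b and overwrites the lanes selected by the mask with a.  A hedged
// usage sketch:
//   AVX2::float_v y = Vc::iif(x < 0.f, -x, x);   // per-lane absolute value
// The deleted overload rejects mixing a SIMD mask with scalar values, so both
// branches must already be vectors of the same type.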
  22994. }
  22995. #endif
  22996. #ifndef Vc_NO_STD_FUNCTIONS
  22997. namespace std
  22998. {
  22999. using Vc::min;
  23000. using Vc::max;
  23001. using Vc::abs;
  23002. using Vc::asin;
  23003. using Vc::atan;
  23004. using Vc::atan2;
  23005. using Vc::ceil;
  23006. using Vc::cos;
  23007. using Vc::exp;
  23008. using Vc::fma;
  23009. using Vc::trunc;
  23010. using Vc::floor;
  23011. using Vc::frexp;
  23012. using Vc::ldexp;
  23013. using Vc::log;
  23014. using Vc::log10;
  23015. using Vc::log2;
  23016. using Vc::round;
  23017. using Vc::sin;
  23018. using Vc::sqrt;
  23019. using Vc::isfinite;
  23020. using Vc::isnan;
  23021. }
  23022. #endif
  23023. Vc_RESET_DIAGNOSTICS
  23024. #endif